Merge from trunk

This commit is contained in:
Charles Haley 2012-04-28 14:10:32 +02:00
commit ee82fdac05
213 changed files with 58766 additions and 45210 deletions

@@ -19,6 +19,158 @@
# new recipes:
# - title:
- version: 0.8.49
date: 2012-04-27
new features:
- title: "Experimental support for generating Amazon's new KF8 format MOBI files"
description: "calibre can now generate Amazon's new KF8 format MOBI files.
To turn on this feature, go to Preferences->Tweaks and click Plugin Tweaks. In the box add:
test_mobi_output_type = 'both'
calibre will now produce MOBI files that have both the old MOBI format and the new KF8 format in them.
To learn more about KF8, see: http://www.amazon.com/gp/feature.html?docId=1000729511
Note that calibre support for KF8 is still experimental and there will likely be bugs."
- title: "Upgrade to using cssutils 0.9.9 for CSS parsing. Improved speed and robustness."
- title: "Show cover size in a tooltip in the conversion dialog"
tickets: [986958]
- title: "Driver for Nook Simple Touch with Glow Light"
tickets: [989264]
bug fixes:
- title: "Heuristics: When italicizing words do not operate on words not in between HTML tags."
tickets: [986298]
- title: "Fix (I hope) the bulk metadata download process crashing for some people on OS X when clicking the Yes button to apply the updates."
tickets: [986658]
- title: "Fix tooltip not being updated in the book details panel when pasting in a new cover"
tickets: [986958]
- title: "Cover Browser: Wrap the title on space only, not in between words."
tickets: [986516]
- title: "Edit metadata dialog: If a permission denied error occurs when clicking the next or prev buttons, stay on the current book."
tickets: [986903]
- title: "Fix heuristics not removing unnecessary hyphens from the end of lines."
tickets: [822744]
improved recipes:
- Metro Nieuws NL
- Der Tagesspiegel
new recipes:
- title: Berria
author: Alayn Gortazar
- title: Sol Haber
author: Onur Gungor
- title: Telam
author: Darko Miletic
- title: Richmond Times-Dispatch
author: jde
- version: 0.8.48
date: 2012-04-20
new features:
- title: "Conversion: The search and replace feature has been completely revamped."
description: "You can now use any number of search and replace
expressions, not just three. You can also store and load frequently used
sets of search and replace expressions. Also, the wizard generates its
preview in a separate process to protect against crashes/memory leaks."
tickets: [983476,983484,983478]
- title: "Support for the new '.azw3' files that Amazon recently started generating. calibre will now detect them as ebooks. It can also view/convert them, if they are DRM free."
- title: "Drivers for Samsung Galaxy ACE GT-S5830L and HTC One X"
tickets: [981185]
bug fixes:
- title: "Get Books: Support the new website design of Barnes & Noble"
- title: "T1 driver: Fix books sent to SD card sometimes resulting problems when deleted."
tickets: [943586]
- title: "Do not allow author names to be set to blank via the Manage authors function. Blank authors are now automatically set to 'Unknown'"
- title: "MOBI Output: Handle background color specified on <td> and <tr> in addition to <table> tags."
tickets: [980813]
- title: "MOBI Output: Fix underline style applied to parent element not getting inherited by <a> children."
tickets: [985711]
improved recipes:
- xkcd
- Metro Nieuws
- Calgary Herald
- Orlando Sentinel
- countryfile
- Heise
new recipes:
- title: Various new Polish news sources
author: fenuks
- title: Various Italian news sources
author: faber1971
- title: Jakarta Globe
author: rty
- title: Acim Bilim Dergisi
author: thomass
- version: 0.8.47
date: 2012-04-13
new features:
- title: "Conversion pipeline: Add support for all the named entities in the HTML 5 spec."
tickets: [976056]
- title: "Support for viewing and converting the Haodoo PDB ebook format"
tickets: [976478]
- title: "Device driver for Laser EB720"
bug fixes:
- title: "Fix regression in automatic adding in 0.8.46 that broke automatic adding if adding of duplicates is enabled and auto convert is also enabled"
tickets: [976336]
- title: 'Fix "Tags" field in advanced search does not obey regex setting'
tickets: [980221]
- title: "EPUB Input: Automatically extract cover image from simple HTML title page that consists of only a single <img> tag, instead of rendering the page"
- title: "Prevent errors when both author and author_sort are used in a template for reading metadata from filenames for files on a device"
- title: "Amazon metadata download: Handle books whose titles start with a bracket."
tickets: [976365]
- title: "Get Books: Fix downloading of purchased books from Baen"
tickets: [975929]
improved recipes:
- Forbes
- Caros Amigos
- Trouw
- Sun UK
- Metro
- Daily Mirror
new recipes:
- title: "Melbourne Herald Sun"
author: Ray Hartley
- title: "Editoriali and Zerocalcare"
author: faber1971
- version: 0.8.46
date: 2012-04-06

@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1334868409(BasicNewsRecipe):
title = u'AÇIK BİLİM DERGİSİ'
description = 'Aylık çevrimiçi bilim dergisi'
__author__ = u'thomass'
oldest_article = 30
max_articles_per_feed = 300
auto_cleanup = True
encoding = 'UTF-8'
publisher = 'açık bilim'
category = 'haber, bilim,TR,dergi'
language = 'tr'
publication_type = 'magazine'
conversion_options = {
'tags' : category
,'language' : language
,'publisher' : publisher
,'linearize_tables': True
}
cover_img_url = 'http://www.acikbilim.com/wp-content/themes/Equilibrium/images/logodene.jpg'
masthead_url = 'http://www.acikbilim.com/wp-content/themes/Equilibrium/images/logodene.jpg'
feeds = [(u'Tüm Yayınlar', u'http://www.acikbilim.com/feed')]

@@ -9,6 +9,7 @@ class Adventure_zone(BasicNewsRecipe):
no_stylesheets = True
oldest_article = 20
max_articles_per_feed = 100
index='http://www.adventure-zone.info/fusion/'
use_embedded_content=False
preprocess_regexps = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: '')]
remove_tags_before= dict(name='td', attrs={'class':'main-bg'})
@@ -45,6 +46,19 @@ class Adventure_zone(BasicNewsRecipe):
skip_tag = skip_tag.findAll(name='a')
for r in skip_tag:
if r.strong:
word=r.strong.string.lower()
if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)):
return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
def preprocess_html(self, soup):
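# remove the second footer link, strip inline styles and make relative article links absolute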
footer=soup.find(attrs={'class':'news-footer middle-border'})
if footer and len(footer('a'))>=2:
footer('a')[1].extract()
for item in soup.findAll(style=True):
del item['style']
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup

@@ -68,4 +68,7 @@ class Benchmark_pl(BasicNewsRecipe):
self.image_article(soup, soup.body)
else:
self.append_page(soup, soup.body)
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.INDEX + a['href']
return soup

recipes/berria.recipe (new file)

@@ -0,0 +1,44 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Alayn Gortazar <zutoin at gmail dot com>'
'''
www.berria.info
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Berria(BasicNewsRecipe):
title = 'Berria'
__author__ = 'Alayn Gortazar'
description = 'Euskal Herriko euskarazko egunkaria'
publisher = 'Berria'
category = 'news, politics, sports, Basque Country'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
language = 'eu'
remove_empty_feeds = True
masthead_url = 'http://upload.wikimedia.org/wikipedia/commons/thumb/6/6a/Berria_Logo.svg/400px-Berria_Logo.svg.png'
keep_only_tags = [
dict(id='goiburua'),
dict(name='div', attrs={'class':['ber_ikus']}),
dict(name='section', attrs={'class':'ber_ikus'})
]
remove_tags = [
dict(name='a', attrs={'class':'iruzkinak'}),
dict(name='div', attrs={'class':'laguntzaileak'})
]
extra_css = '#goiburua{font-weight: bold} .zintiloa{font-size: small} .sarrera{color:#666} .titularra{font-size: x-large} .sarrera{font-weight: bold} .argazoin{color:#666; font-size: small}'
feeds = [
(u'Edizioa jarraia', u'http://berria.info/rss/ediziojarraia.xml'),
(u'Iritzia', u'http://berria.info/rss/iritzia.xml'),
(u'Euskal Herria', u'http://berria.info/rss/euskalherria.xml'),
(u'Ekonomia', u'http://berria.info/rss/ekonomia.xml'),
(u'Mundua', u'http://berria.info/rss/mundua.xml'),
(u'Kirola', u'http://berria.info/rss/kirola.xml'),
(u'Plaza', u'http://berria.info/rss/plaza.xml')
]

@@ -1,220 +1,35 @@
from calibre.web.feeds.news import BasicNewsRecipe
class CalgaryHerald(BasicNewsRecipe):
title = u'Calgary Herald'
oldest_article = 3
max_articles_per_feed = 100
feeds = [
(u'News', u'http://rss.canada.com/get/?F233'),
(u'Calgary', u'http://www.calgaryherald.com/scripts/sp6query.aspx?catalog=cahr&tags=keyword|calgary&output=rss?link=http%3a%2f%2fwww.calgaryherald'),
(u'Alberta', u'http://www.calgaryherald.com/scripts/Sp6Query.aspx?catalog=CAHR&tags=Keyword|Alberta&output=rss?link=http%3A%2F%2Fwww.calgaryherald.com%2Fnews%2Falberta%2Findex.html'),
(u'Politics', u'http://rss.canada.com/get/?F7551'),
(u'National', u'http://rss.canada.com/get/?F7552'),
(u'World', u'http://rss.canada.com/get/?F7553'),
]
__author__ = 'rty'
publisher = 'Calgary Herald'
description = 'Calgary, Alberta, Canada'
category = 'News, Calgary, Alberta, Canada'
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
language = 'en_CA'
encoding = 'utf-8'
conversion_options = {'linearize_tables':True}
##masthead_url = 'http://www.calgaryherald.com/index.html'
keep_only_tags = [
dict(name='div', attrs={'id':'storyheader'}),
dict(name='div', attrs={'id':'storycontent'})
]
remove_tags_after = {'class':"story_tool_hr"}

@@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1331729727(BasicNewsRecipe):
title = u'Camera di Commercio di Bari'
oldest_article = 7
__author__ = 'faber1971'
description = 'News from the Chamber of Commerce of Bari'
language = 'it'
max_articles_per_feed = 100
auto_cleanup = True
masthead_url = 'http://www.ba.camcom.it/grafica/layout-bordo/logo_camcom_bari.png'
feeds = [(u'Camera di Commercio di Bari', u'http://feed43.com/4715147488845101.xml')]
__license__ = 'GPL v3'
__copyright__ = '2012, faber1971'
__version__ = 'v1.00'
__date__ = '17, April 2012'

@@ -6,6 +6,7 @@ class CD_Action(BasicNewsRecipe):
description = 'cdaction.pl - polish games magazine site'
category = 'games'
language = 'pl'
index='http://www.cdaction.pl'
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets= True
@@ -17,4 +18,10 @@ class CD_Action(BasicNewsRecipe):
def get_cover_url(self):
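# pull the current issue's cover image from the magazine archive page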
soup = self.index_to_soup('http://www.cdaction.pl/magazyn/')
self.cover_url='http://www.cdaction.pl'+ soup.find(id='wspolnik').div.a['href']
return getattr(self, 'cover_url', self.cover_url)
def preprocess_html(self, soup):
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup

@@ -1,11 +1,12 @@
from calibre import browser
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
title = u'Countryfile.com'
cover_url = 'http://www.buysubscriptions.com/static_content/the-immediate/en/images/covers/CFIL_maxi.jpg'
#cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
__author__ = 'Dave Asbury'
description = 'The official website of Countryfile Magazine'
# last updated 15/4/12
language = 'en_GB'
oldest_article = 30
max_articles_per_feed = 25
@@ -13,7 +14,23 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
no_stylesheets = True
auto_cleanup = True
#articles_are_obfuscated = True
def get_cover_url(self):
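# scrape the current cover thumbnail from the homepage image cache; fall back to a known cover URL if it cannot be fetched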
soup = self.index_to_soup('http://www.countryfile.com/')
cov = soup.find(attrs={'class' : 'imagecache imagecache-160px_wide imagecache-linked imagecache-160px_wide_linked'})
#print '******** ',cov,' ***'
cov2 = str(cov)
cov2=cov2[124:-90]
#print '******** ',cov2,' ***'
# try to get cover - if can't get known cover
br = browser()
br.set_handle_redirect(False)
try:
br.open_novisit(cov2)
cover_url = cov2
except:
cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
return cover_url
remove_tags = [
# dict(attrs={'class' : ['player']}),

@@ -1,20 +1,21 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
import mechanize
class AdvancedUserRecipe1306061239(BasicNewsRecipe):
title = u'The Daily Mirror'
description = 'News as provide by The Daily Mirror -UK'
__author__ = 'Dave Asbury'
# last updated 7/4/12
language = 'en_GB'
#cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'
oldest_article = 1
max_articles_per_feed = 10
remove_empty_feeds = True
remove_javascript = True
no_stylesheets = True
@@ -75,3 +76,28 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
img { display:block}
'''
def get_cover_url(self):
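# fetch today's Mirror front page image via politicshome.com; fall back to a static cover if unavailable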
soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
# look for the block containing the mirror button and url
cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_92.gif);'})
cov2 = str(cov)
cov2='http://www.politicshome.com'+cov2[9:-142]
#cov2 now contains url of the page containing pic
soup = self.index_to_soup(cov2)
cov = soup.find(attrs={'id' : 'large'})
cov2 = str(cov)
cov2=cov2[27:-18]
#cov2 now is pic url, now go back to original function
br = mechanize.Browser()
br.set_handle_redirect(False)
try:
br.open_novisit(cov2)
cover_url = cov2
except:
cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
#cover_url = cov2
#cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
return cover_url

@@ -11,6 +11,7 @@ class Dobreprogramy_pl(BasicNewsRecipe):
cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png'
description = u'Aktualności i blogi z dobreprogramy.pl'
encoding = 'utf-8'
index='http://www.dobreprogramy.pl/'
no_stylesheets = True
language = 'pl'
extra_css = '.title {font-size:22px;}'
@@ -22,3 +23,10 @@ class Dobreprogramy_pl(BasicNewsRecipe):
#remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})]
feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'),
('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')]
def preprocess_html(self, soup):
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup

@@ -7,6 +7,7 @@ class Dzieje(BasicNewsRecipe):
cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png'
category = 'history'
language = 'pl'
index='http://dzieje.pl'
oldest_article = 8
max_articles_per_feed = 100
remove_javascript=True
@@ -15,3 +16,10 @@ class Dzieje(BasicNewsRecipe):
remove_tags_after= dict(id='dogory')
remove_tags=[dict(id='dogory')]
feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')]
def preprocess_html(self, soup):
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup

@@ -21,3 +21,8 @@ class eioba(BasicNewsRecipe):
(u'Rozrywka', u'http://www.eioba.pl/feed/categories/10.xml'),
(u'Rożne', u'http://www.eioba.pl/feed/categories/9.xml')
]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup

@@ -7,6 +7,7 @@ class eMuzyka(BasicNewsRecipe):
description = u'Emuzyka to największa i najpopularniejsza strona o muzyce w Polsce'
category = 'music'
language = 'pl'
index='http://www.emuzyka.pl'
cover_url='http://s.emuzyka.pl/img/emuzyka_invert_small.jpg'
no_stylesheets = True
oldest_article = 7
@@ -14,3 +15,9 @@ class eMuzyka(BasicNewsRecipe):
keep_only_tags=[dict(name='div', attrs={'id':'news_container'}), dict(name='h3'), dict(name='div', attrs={'class':'review_text'})]
remove_tags=[dict(name='span', attrs={'id':'date'})]
feeds = [(u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'), (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2')]
def preprocess_html(self, soup):
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup

@@ -7,7 +7,7 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
# cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'
__author__ = 'Dave Asbury'
# last updated 14/4/12
language = 'en_GB'
oldest_article = 28
max_articles_per_feed = 12
@@ -28,7 +28,8 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
#]
feeds = [
(u'From the Homepage',u'http://feed43.com/0032328550253453.xml'),
#http://feed43.com/8053226782885416.xml'),
(u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'),
(u'Upgrade',u'http://feed43.com/0877305847443234.xml'),
#(u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),

@@ -7,6 +7,7 @@ class Filmweb_pl(BasicNewsRecipe):
cover_url = 'http://userlogos.org/files/logos/crudus/filmweb.png'
category = 'movies'
language = 'pl'
index='http://www.filmweb.pl'
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets= True
@@ -39,3 +40,9 @@ class Filmweb_pl(BasicNewsRecipe):
self.log.warn(skip_tag)
return self.index_to_soup(skip_tag['href'], raw=True)
def preprocess_html(self, soup):
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup

@@ -1,39 +1,49 @@
from calibre.ebooks.BeautifulSoup import BeautifulSoup
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Forbes(BasicNewsRecipe):
title = u'Forbes'
description = 'Business and Financial News'
__author__ = 'Kovid Goyal'
oldest_article = 30
max_articles_per_feed = 20
language = 'en'
encoding = 'utf-8'
recursions = 1
no_stylesheets = True
cover_url = u'http://www.forbes.com/media/current_covers/forbes_120_160.gif'
feeds = [(u'Latest', u'http://www.forbes.com/news/index.xml'),
(u'Most Popular', u'http://www.forbes.com/feeds/popstories.xml'),
(u'Technology', u'http://www.forbes.com/technology/index.xml'),
(u'Business', u'http://www.forbes.com/business/index.xml'),
(u'Sports Money', u'http://www.forbes.com/sportsmoney/index.xml'),
(u'Leadership', u'http://www.forbes.com/leadership/index.xml'),]
keep_only_tags = \
{'class':lambda x: x and (set(x.split()) & {'body', 'pagination',
'articleHead', 'article_head'})}
remove_tags_before = {'name':'h1'}
remove_tags = [
{'class':['comment_bug', 'engagement_block',
'video_promo_block', 'article_actions']},
{'id':'comments'}
]
def is_link_wanted(self, url, tag):
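# follow links to continuation pages (URLs ending in /2/ through /9/) so multi-page articles are fetched whole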
ans = re.match(r'http://.*/[2-9]/', url) is not None
if ans:
self.log('Following multipage link: %s'%url)
return ans
def postprocess_html(self, soup, first_fetch):
for pag in soup.findAll(True, 'pagination'):
pag.extract()
if not first_fetch:
h1 = soup.find('h1')
if h1 is not None:
h1.extract()
return soup

@@ -0,0 +1,16 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Fotoblogia_pl(BasicNewsRecipe):
title = u'Fotoblogia.pl'
__author__ = 'fenuks'
category = 'photography'
language = 'pl'
masthead_url = 'http://img.interia.pl/komputery/nimg/u/0/fotoblogia21.jpg'
cover_url= 'http://fotoblogia.pl/images/2009/03/fotoblogia2.jpg'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
keep_only_tags=[dict(name='div', attrs={'class':'post-view post-standard'})]
remove_tags=[dict(attrs={'class':['external fotoblogia', 'categories', 'tags']})]
feeds = [(u'Wszystko', u'http://fotoblogia.pl/feed/rss2')]

@@ -6,16 +6,24 @@ class Gameplay_pl(BasicNewsRecipe):
description = u'gameplay.pl - serwis o naszych zainteresowaniach, grach, filmach, książkach, muzyce, fotografii i konsolach.'
category = 'games, movies, books, music'
language = 'pl'
index='http://gameplay.pl'
masthead_url= 'http://gameplay.pl/img/gpy_top_logo.png'
cover_url= 'http://gameplay.pl/img/gpy_top_logo.png'
max_articles_per_feed = 100
remove_javascript= True
no_stylesheets= True
keep_only_tags=[dict(name='div', attrs={'class':['news_endpage_tit', 'news']})]
remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im', 'news_list', 'news_list_autor', 'stop_bot', 'tagi']}), dict(attrs={'usemap':'#map'})]
feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')]
def image_url_processor(self, baseurl, url):
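# image sources are protocol-relative ('//...'); strip the slashes and prepend the site root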
if 'http' not in url:
return 'http://gameplay.pl'+ url[2:]
else:
return url
def preprocess_html(self, soup):
for a in soup('a'):
if a.has_key('href') and '../' in a['href']:
a['href']=self.index + a['href'][2:]
return soup

@@ -9,6 +9,7 @@ class Gildia(BasicNewsRecipe):
language = 'pl'
oldest_article = 8
max_articles_per_feed = 100
remove_empty_feeds=True
no_stylesheets=True
remove_tags=[dict(name='div', attrs={'class':'backlink'}), dict(name='div', attrs={'class':'im_img'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})]
keep_only_tags=dict(name='div', attrs={'class':'widetext'})
@@ -24,3 +25,16 @@ class Gildia(BasicNewsRecipe):
self.log.warn('odnosnik')
self.log.warn(link['href'])
return self.index_to_soup(link['href'], raw=True)
def preprocess_html(self, soup):
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
if '/gry/' in a['href']:
a['href']='http://www.gry.gildia.pl' + a['href']
elif u'książk' in soup.title.string.lower() or u'komiks' in soup.title.string.lower():
a['href']='http://www.literatura.gildia.pl' + a['href']
else:
a['href']='http://www.gildia.pl' + a['href']
return soup

@@ -7,6 +7,7 @@ class Gram_pl(BasicNewsRecipe):
category = 'games'
language = 'pl'
oldest_article = 8
index='http://www.gram.pl'
max_articles_per_feed = 100
no_stylesheets= True
extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
@@ -52,4 +53,7 @@ class Gram_pl(BasicNewsRecipe):
tag=soup.findAll(name='div', attrs={'class':'picbox'})
for t in tag:
t['style']='float: left;'
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup

@@ -59,6 +59,7 @@ class heiseDe(BasicNewsRecipe):
dict(name='span', attrs={'class':'rsaquo'}),
dict(name='div', attrs={'class':'news_logo'}),
dict(name='div', attrs={'class':'bcadv ISI_IGNORE'}),
dict(name='div', attrs={'class':'navi_top_container'}),
dict(name='p', attrs={'class':'news_option'}),
dict(name='p', attrs={'class':'news_navi'}),
dict(name='div', attrs={'class':'news_foren'})]
@@ -69,3 +70,5 @@

@@ -0,0 +1,20 @@
from calibre.web.feeds.news import BasicNewsRecipe
class historia_news(BasicNewsRecipe):
title = u'historia-news'
__author__ = 'fenuks'
description = u'Historia-news to portal dla ludzi kochających historię. Najnowsze newsy z historii bliższej i dalszej, archeologii, paleontologii oraz ciekawostki i podcasty z historii kultury, sportu, motoryzacji i inne.'
masthead_url = 'http://historia-news.pl/templates/hajak4/images/header.jpg'
cover_url= 'http://www.historia-news.pl/templates/hajak4/images/header.jpg'
category = 'history'
language = 'pl'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
remove_empty_feeds = True
remove_tags=[dict(name='form'), dict(name='img', attrs={'alt':'Print'}), dict(attrs={'class':['commbutt', 'cpr']}), dict(id=['plusone', 'facebook'])]
feeds = [(u'Wiadomo\u015bci', u'http://historia-news.pl/wiadomoci.feed?type=rss'), (u'Artyku\u0142y', u'http://historia-news.pl/artykuy.feed?type=rss')]
def print_version(self, url):
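# request the printer-friendly single-page view by appending the site's print-template query parameters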
return url + '?tmpl=component&print=1&layout=default&page='

[Binary files not shown: four new recipe icons added, including recipes/icons/telam.png (1.9 KiB).]

@@ -8,6 +8,7 @@ class in4(BasicNewsRecipe):
description = u'Serwis Informacyjny - Aktualnosci, recenzje'
category = 'IT'
language = 'pl'
index='http://www.in4.pl/'
#cover_url= 'http://www.in4.pl/recenzje/337/in4pl.jpg'
no_stylesheets = True
remove_empty_feeds = True
@@ -39,6 +40,7 @@ class in4(BasicNewsRecipe):
def preprocess_html(self, soup):
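# stitch appended pages into the article body, then make relative links absolute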
self.append_page(soup, soup.body)
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup

@@ -8,6 +8,7 @@ class INFRA(BasicNewsRecipe):
description = u'Serwis Informacyjny INFRA - UFO, Zjawiska Paranormalne, Duchy, Tajemnice świata.'
cover_url = 'http://npn.nazwa.pl/templates/ja_teline_ii/images/logo.jpg'
category = 'UFO'
index='http://infra.org.pl'
language = 'pl'
max_articles_per_feed = 100
no_stylesheets = True
@@ -15,3 +16,11 @@ class INFRA(BasicNewsRecipe):
remove_tags_after=dict(attrs={'class':'pagenav'})
remove_tags=[dict(attrs={'class':'pagenav'})]
feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/index.php?option=com_rd_rss&id=1')]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup

@@ -0,0 +1,34 @@
from calibre.web.feeds.news import BasicNewsRecipe
class JakartaGlobe(BasicNewsRecipe):
title = u'Jakarta Globe'
oldest_article = 3
max_articles_per_feed = 100
feeds = [
(u'News', u'http://www.thejakartaglobe.com/pages/getrss/getrss-news.php'),
(u'Business', u'http://www.thejakartaglobe.com/pages/getrss/getrss-business.php'),
(u'Technology', u'http://www.thejakartaglobe.com/pages/getrss/getrss-tech.php'),
(u'My Jakarta', u'http://www.thejakartaglobe.com/pages/getrss/getrss-myjakarta.php'),
(u'International', u'http://www.thejakartaglobe.com/pages/getrss/getrss-international.php'),
(u'Life and Times', u'http://www.thejakartaglobe.com/pages/getrss/getrss-lifeandtimes.php'),
]
__author__ = 'rty'
publisher = 'JakartaGlobe.com'
description = 'JakartaGlobe, Indonesia, Newspaper'
category = 'News, Indonesia'
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
language = 'en_ID'
encoding = 'utf-8'
conversion_options = {'linearize_tables':True}
masthead_url = 'http://www.thejakartaglobe.com/pages/2010/images/jak-globe-logo.jpg'
keep_only_tags = [
dict(name='div', attrs={'class':'story'}),
dict(name='span', attrs={'class':'headline'}),
dict(name='p', attrs={'id':'bodytext'})
]

@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class Konflikty(BasicNewsRecipe):
title = u'Konflikty Zbrojne'
@@ -10,6 +11,23 @@ class Konflikty(BasicNewsRecipe):
category='military, history'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
keep_only_tags=[dict(attrs={'class':['title1', 'image']}), dict(id='body')]
feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'),
(u'Artyku\u0142y', u'http://www.konflikty.pl/rss_artykuly_10.xml'),
(u'Historia', u'http://www.konflikty.pl/rss_historia_10.xml'),
(u'Militaria', u'http://www.konflikty.pl/rss_militaria_10.xml'),
(u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'),
(u'Recenzje', u'http://www.konflikty.pl/rss_recenzje_10.xml'),
(u'Teksty źródłowe', u'http://www.konflikty.pl/rss_tekstyzrodlowe_10.xml')]
def preprocess_html(self, soup):
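# drop inline styles and wrap linked images in divs captioned with the image's alt text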
for item in soup.findAll(style=True):
del item['style']
for image in soup.findAll(name='a', attrs={'class':'image'}):
if image.img and image.img.has_key('alt'):
image.name='div'
pos = len(image.contents)
image.insert(pos, BeautifulSoup('<p style="font-style:italic;">'+image.img['alt']+'</p>'))
return soup

@@ -0,0 +1,12 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1334649829(BasicNewsRecipe):
title = u'Liberatorio Politico'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
masthead_url = 'http://liberatorio.altervista.org/wp-content/uploads/2012/01/Testata-LIBERATORIO-Altervista1.jpg'
feeds = [(u'Liberatorio Politico', u'http://liberatorio.altervista.org/feed/')]
__author__ = 'faber1971'
description = 'Inquiry journalism - a blog on Molfetta, Land of Bari, Apulia and Italy - v1.00 (07, April 2012)'
language = 'it'

recipes/limes.recipe (new file)

@@ -0,0 +1,50 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2012, faber1971'
__version__ = 'v1.00'
__date__ = '16, April 2012'
__description__ = 'Geopolitical Italian magazine'
from calibre.web.feeds.news import BasicNewsRecipe
class Limes(BasicNewsRecipe):
description = 'Italian weekly magazine'
__author__ = 'faber1971'
cover_url = 'http://temi.repubblica.it/UserFiles/limes/Image/Loghi/logo-limes.gif'
title = 'Limes'
category = 'Geopolitical news'
language = 'it'
# encoding = 'cp1252'
timefmt = '[%a, %d %b, %Y]'
oldest_article = 16
max_articles_per_feed = 100
use_embedded_content = False
recursions = 10
remove_javascript = True
no_stylesheets = True
masthead_url = 'http://temi.repubblica.it/UserFiles/limes/Image/Loghi/logo-limes.gif'
feeds = [
(u'Limes', u'http://temi.repubblica.it/limes/feed/')
]
keep_only_tags = [
dict(name='div', attrs={'class':['testo','copertina','occhiello','firma','didascalia','content-second-right','detail-articles','titolo-local','generic-articles']}),
dict(name='div', attrs={'class':['generic-articles','summary','detail-articles']}),
dict(name='div', attrs={'id':['content-second-right','content2']})
]
remove_tags = [
dict(name='div',attrs={'class':['servizi','aggiungi','label-web','bottom-mobile','box-abbonamenti','box-cerca','big','little','stampaweb']}),
dict(name='div',attrs={'id':['topheader','header','navigation-new','navigation','content-second-left','menutext']}),
dict(name='ul',attrs={'id':'user-utility'}),
dict(name=['script','noscript','iframe'])
]

@@ -1,11 +1,13 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1327062445(BasicNewsRecipe):
title = u'Marketing Magazine'
description = 'Collection of Italian marketing websites'
language = 'it'
__author__ = 'faber1971'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
@@ -16,4 +18,4 @@ class AdvancedUserRecipe1327062445(BasicNewsRecipe):
dict(name='ul', attrs={'id':'ads0'})
]
masthead_url = 'http://www.simrendeogun.com/wp-content/uploads/2011/06/New-Marketing-Magazine-Logo.jpg'
feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'MarketingArena', u'http://feeds.feedburner.com/marketingarena'), (u'Marketing Journal', u'http://feeds.feedburner.com/marketingjournal/jPwA'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'minimarketing', u'http://feeds.feedburner.com/minimarketingit'), (u'[4]marketing.biz', u'http://feeds.feedburner.com/4marketing'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Bloguerrilla', u'http://feeds.feedburner.com/Bloguerrilla'), (u'Nonconvenzionale', u'http://feeds.feedburner.com/nonconvenzionale'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')]

@@ -3,25 +3,6 @@ from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.utils.magick import Image
from BeautifulSoup import BeautifulSoup
''' Version 1.2, updated cover image to match the changed website.
added info date on title
@@ -43,80 +24,75 @@ except:
extended timeout from 2 to 10
changed oldest article from 10 to 1.2
changed max articles from 15 to 25
Version 1.9.1 18-04-2012
removed some debug settings
updated code to match new metro-layout
Version 1.9.2 24-04-2012
updated code to match new metro-layout
Version 1.9.3 25-04-2012
Changed a lot of custom code into calibre code as the default code of calibre has become much faster since the first version fo this recipe
Added new feeds
Updated css
Changed order of regex to speedup proces
'''
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Metro Nieuws NL'
oldest_article = 1.2
max_articles_per_feed = 25
__author__ = u'DrMerry'
description = u'Metro Nederland'
language = u'nl'
simultaneous_downloads = 5
masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
timeout = 10
center_navbar = True
timefmt = ' [%A, %d %b %Y]'
no_stylesheets = True
remove_javascript = True
remove_empty_feeds = True
cover_url = 'http://www.oldreadmetro.com/img/en/metroholland/last/1/small.jpg'
publication_type = 'newspaper'
encoding = 'utf-8'
remove_attributes = ['style', 'font', 'width', 'height', 'itemtype', 'itemprop', 'itemscope']#, 'href']
use_embedded_content = False
conversion_options = {
'authors' : 'Metro Nederland & calibre & DrMerry',
'author_sort' : 'Metro Nederland & calibre & DrMerry',
'publisher' : 'DrMerry/Metro Nederland'
}
extra_css = 'body{font-size:1em;padding:5px 0}body,a,h2{background-color:#fff;text-decoration:none;color:#000}#date,div.byline,p.article-image-caption .credits,.calibrenavbar{font-size:.5em}.article-box-fact.module-title,#date,div.byline{clear:both}.article-box-fact.module-title{margin:8px 0}.article-box-fact.module-title,h2{font-size:1.1em}h1.title{font-size:1.4em}h1.title,.article-body p,div.article-image-caption-2column,div.article-image-caption-3column,#date,div.byline{margin-bottom:.6em}div.article-box-fact div.subtitle,.article-box-fact.module-title,h1.title,p.article-image-caption{font-weight:700}div.column-1-3{margin-left:19px}div.column-1-2{display:inline}div.column-1-2,div.column-1-3{margin-right:7px}p.article-image-caption{font-size:.6em;margin-top:5px}p.article-image-caption,#date,div.byline{color:#616262}p.article-image-caption .credits{font-style:italic}div.article-image-caption{width:246px}div.article-image-caption-2column{width:373px}div.column-3{background-color:#eee;float:right;width:50%}div.column-3 module-title{border:1px solid #aaa}div.article-box-fact div.subtitle,.article-box-fact.module-title{color:#24763b}div.byline{border-top:2px solid #24763b}div.column-3,img,div.column-3,p.small,div.article-image-caption{margin:.5em}img,p.small,.column1,h2{border:0;padding:0}.column1,h1,h2{margin:0}'
preprocess_regexps = [
(re.compile(r'(&nbsp;|\s|<img[^>]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)', re.DOTALL|re.IGNORECASE),lambda match: ' '),
#(re.compile(r'(&nbsp;|\s)+', re.DOTALL|re.IGNORECASE),lambda match:' '),
#(re.compile(r'<(a |/a)[^>]*>', re.DOTALL|re.IGNORECASE),lambda match:'')
#(re.compile('(</?)h2', re.DOTALL|re.IGNORECASE),lambda match:'\1em')
]
remove_tags_before= dict(id='date')
remove_tags_after = [dict(name='div', attrs={'class':['column-1-3','gallery-text']})]#id='share-and-byline')]
remove_tags = [
dict(name=['iframe','script','noscript','style']),
dict(name='div', attrs={'class':[re.compile('column-[14]-5'),'col-179 ','col-373 ','clear','ad','navigation',re.compile('share-tools(-top)?'),'tools','metroCommentFormWrap','article-tools-below-title','related-links','padding-top-15',re.compile('^promo.*?$'),'teaser-component',re.compile('fb(-comments|_iframe_widget)')]}),
dict(id=['column-1-5-bottom','column-4-5',re.compile('^ad(\d+|adcomp.*?)?$'),'sidebar',re.compile('^article-\d'),'comments','gallery-1']),
dict(name='a', attrs={'name':'comments'}),
#dict(name='div', attrs={'data-href'}),
dict(name='img', attrs={'class':'top-line'}),
dict(attrs={'style':re.compile('^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$'),'title':'volledig scherm'})]
'''removed by before/after:
id:
column-1-5-top,'hidden_div','footer',
class:
'header',re.compile('^footer-[a-zA-Z0-9]+$),'header-links',
'''
def preprocess_html(self, soup):
myProcess = MerryProcess()
myProcess.moveTitleAndAuthor(soup)
myProcess.removeUnwantedTags(soup)
return soup
def postprocess_html(self, soup, first):
myProcess = MerryProcess()
myProcess.optimizeLayout(soup)
return soup
feeds = [
@@ -128,295 +104,109 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
(u'Buitenland', u'http://www.metronieuws.nl/rss.xml?c=1277377288-4'),
(u'Columns', u'http://www.metronieuws.nl/rss.xml?c=1277377288-17'),
(u'Entertainment', u'http://www.metronieuws.nl/rss.xml?c=1277377288-2'),
(u'Strips',u'http://www.metronieuws.nl/rss.xml?c=1325037714-0'),
(u'Tech', u'http://www.metronieuws.nl/rss.xml?c=1283166782-12'),
(u'Familie', u'http://www.metronieuws.nl/rss.xml?c=1283166782-9'),
(u'Blogs', u'http://www.metronieuws.nl/rss.xml?c=1295586825-6'),
(u'Reizen', u'http://www.metronieuws.nl/rss.xml?c=1277377288-13'),
(u'Carrière', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
(u'Wetenschap',u'http://www.metronieuws.nl/rss.xml?c=1303088437-0'),
(u'Planeet',u'http://www.metronieuws.nl/rss.xml?c=1277377288-14'),
(u'Gezondheid',u'http://www.metronieuws.nl/rss.xml?c=1277377288-15'),
(u'Sport', u'http://www.metronieuws.nl/rss.xml?c=1277377288-12')
]
class MerryPreProcess():
def replacePictures(self, soup):
#to be implemented
return soup
def optimizePicture(self,soup):
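# trim whitespace borders from each downloaded image with calibre's magick wrapper; images that fail to process are skipped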
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
try:
iurl = tag['src']
img = Image()
img.open(iurl)
img.trim(0)
img.save(iurl)
except:
print '\n!!image optimize failed!!\n'
continue
return soup
class MerryExtract():
def safeRemovePart(self, killingSoup, soupIsArray):
if killingSoup and not killingSoup == None:
try:
if soupIsArray == True:
for killer in killingSoup:
killer.extract()
else:
killingSoup.extract()
except:
return False
else:
return False
return killingSoup
class MerryReplace():
myKiller = MerryExtract()
def replaceATag(self, soup):
anchors = []
anchors = soup.findAll('a')
if anchors and not (anchors == None or anchors == []):
try:
for link in anchors:
# print str(link)
if link and not link == None:
# print ('type: %s'%(str(type(link))))
# print ('link: %s' % (link))
myParent = link.parent
# print str('parent: %s'%(myParent))
try:
myIndex = link.parent.index(link)
hasIndex = True
except:
myIndex = 0
hasIndex = False
# print str('index %s'%(myIndex))
if not link.string == None:
# print 'link=notnone'
if hasIndex == True:
myParent.insert(myIndex, link.string)
else:
myParent.append(link.string)
else:
# print 'link=none'
myParent.insert(myIndex, link.contents)
self.myKiller.safeRemovePart(link, False)
else:
notshown = 'tag received is empty' # print
except:
notshown = 'tag received is empty' # print notshown
return soup
class MerryProcess(BeautifulSoup):
myKiller = MerryExtract()
myReplacer = MerryReplace()
myPrepare = MerryPreProcess()
def optimizeLayout(self,soup):
self.myPrepare.optimizePicture(soup)
return soup
def insertFacts(self, soup):
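# move the 'article-box-fact' block to the top of the article body, dropping unrelated sibling tags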
thefactpart = re.compile('^article-box-fact.*$')
allfacts = soup.findAll('div', {'class':thefactpart})
if allfacts and not allfacts == None:
allfactsparent = soup.find('div', {'class':thefactpart}).parent
for part in allfactsparent:
if not part in allfacts:
self.myKiller.safeRemovePart(part, True)
articlefacts = soup.find('div', {'class':'article-box-fact column'})
errorOccured=False
if (articlefacts and not articlefacts==None):
try:
contenttag = soup.find('div', {'class':'article-body'})
foundrighttag = False
if contenttag and not contenttag == None:
foundrighttag = True
if foundrighttag == True:
contenttag.insert(0, allfactsparent)
except:
errorOccured=True
else:
errorOccured=True
pass
return soup
def moveTitleAndAuthor(self, soup):
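# reposition the headline above the date block and move the byline toward the end of the article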
moveitem = soup.h1
pubdate = soup.find(id="date")
if moveitem and not moveitem == None and pubdate and not pubdate == None:
try:
pubdate.parent.insert(0, moveitem)
except:
print '\n!!error in moving title!!\n'
pass
moveitem = None
moveitem = soup.find('div', {'class':'byline'})
if moveitem and not moveitem == None:
try:
moveitem.parent.parent.insert(-1, moveitem)
except:
print '\n!!error in moving byline!!\n'
pass
return soup
def previousNextSibRemover(self, soup, previous=True, soupIsArray=False):
findsibsof = soup
firstpart = previous
if findsibsof and not findsibsof == None:
if soupIsArray == True:
for foundsib in findsibsof:
self.previousNextSibRemover(foundsib, firstpart, soupIsArray=False)
else:
if firstpart == True and soupIsArray == False:
sibs = findsibsof.previousSiblingGenerator()
else:
sibs = findsibsof.nextSiblingGenerator()
for sib in sibs:
self.myKiller.safeRemovePart(sib, True)
return
def removeUnwantedTags(self,soup):
if SHOWDEBUG1 == True:
mlog.addTextAndTag(['Len of Soup before RemoveTagsByName'],[len(str(soup))])
mlog.showDebug()
self.removeTagsByName(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before firstandlastpart: %s' % len(str(soup)))
mlog.showDebug()
self.insertFacts(soup)
self.removeFirstAndLastPart(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before unwantedpart: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedParts(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before EmptyParts: %s' % len(str(soup)))
mlog.showDebug()
self.removeEmptyTags(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup after EmptyParts: %s' % len(str(soup)))
mlog.showDebug()
self.myReplacer.replaceATag(soup)
return soup
def removeUnwantedParts(self, soup):
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before UnwantedID: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByID(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before Class: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByClass(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before Style: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByStyle(soup)
return soup
def removeUnwantedTagsByStyle(self,soup):
self.removeArrayOfTags(soup.findAll(attrs={'style' : re.compile("^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$")}))
if SHOWDEBUG0 == True:
mlog.addDebug('end remove by style')
self.removeArrayOfTags(soup.findAll(attrs={'class': 'share-tools-bottom'})) # at end to keep author
return soup
def removeArrayOfTags(self,souparray):
return self.myKiller.safeRemovePart(souparray, True)
def removeUnwantedTagsByClass(self,soup):
if SHOWDEBUG0 == True:
mlog.addDebug('start remove by class')
self.removeArrayOfTags(soup.findAll("div", { "class" :re.compile('^(promo.*?|article-tools-below-title|metroCommentFormWrap|ad|share-tools|tools|header-links|related-links|padding-top-15)$')}))
return soup
def removeUnwantedTagsByID(self,soup):
defaultids = ['footer-extra',re.compile('^ad(\d+|adcomp.*?)?$'),'column-4-5','navigation','header',re.compile('^column-1-5-(top|bottom)$'),'footer','hidden_div','sidebar',re.compile('^article-\d$'),'comments','footer']
for removeid in defaultids:
if SHOWDEBUG1 == True:
mlog.addDebug('RemoveTagByID, tag: %s, Len of Soup: %s' % (str(removeid), len(str(soup))))
mlog.showDebug()
self.removeArrayOfTags(soup.findAll(id=removeid))
return soup
# def safeRemoveTag(self, subtree):
# return self.myKiller.safeRemovePart(subtree, True)
def removeTagsByName(self, soup):
self.myKiller.safeRemovePart(soup.script, True)
self.myKiller.safeRemovePart(soup.iframe, True)
self.myKiller.safeRemovePart(soup.style, True)
self.myKiller.safeRemovePart(soup.noscript, True)
return soup
def removeEmptyTags(self,soup,run=0):
if SHOWDEBUG0 == True:
mlog.addDebug('starting removeEmptyTags')
if SHOWDEBUG1 == True:
run += 1
mlog.addDebug(run)
if SHOWDEBUG2 == True:
mlog.addDebug(str(soup.prettify()))
mlog.showDebug()
emptymatches = re.compile(r'^(&nbsp;|\s)*$')
emptytags = soup.findAll(lambda tag: tag.find(True) is None and (tag.string is None or emptymatches.match(tag.string)) and not tag.isSelfClosing)
if emptytags:
if SHOWDEBUG1 == True:
mlog.addDebug('tags found')
mlog.addDebug(str(emptytags))
self.removeArrayOfTags(emptytags)
#recursive in case removing empty tag creates new empty tag
self.removeEmptyTags(soup, run=run)
else:
if SHOWDEBUG1 == True:
mlog.addDebug('no empty tags found')
mlog.showDebug()
if SHOWDEBUG0 == True:
if SHOWDEBUG2 == True:
mlog.addDebug('new soup:')
mlog.addDebug(str(soup.prettify()))
mlog.addDebug('RemoveEmptyTags Completed')
mlog.showDebug()
return soup
def removeFirstAndLastPart(self,soup):
def findparenttag(lookuptag):
if lookuptag is not None:
return lookuptag.findParents()
findtag = soup.find(id="date")
self.previousNextSibRemover(findtag, previous=True, soupIsArray=False)
self.previousNextSibRemover(findparenttag(findtag), previous=True, soupIsArray=True)
for endtag in [soup.find(id="share-and-byline"), soup.find("div", { "class" : "gallery-text" })]:
self.previousNextSibRemover(endtag, previous=False, soupIsArray=False)
self.previousNextSibRemover(findparenttag(endtag), previous=False, soupIsArray=True)
return soup

View File

@ -1,52 +1,30 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Metro UK'
description = 'News as provided by The Metro - UK'
#timefmt = ''
__author__ = 'Dave Asbury'
#last update 3/12/11
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'
no_stylesheets = True
#no_stylesheets = True
oldest_article = 1
max_articles_per_feed = 20
max_articles_per_feed = 10
remove_empty_feeds = True
remove_javascript = True
auto_cleanup = True
#preprocess_regexps = [(re.compile(r'Tweet'), lambda a : '')]
preprocess_regexps = [
(re.compile(r'<span class="img-cap legend">', re.IGNORECASE | re.DOTALL), lambda match: '<p></p><span class="img-cap legend"> ')]
preprocess_regexps = [
(re.compile(r'tweet', re.IGNORECASE | re.DOTALL), lambda match: '')]
language = 'en_GB'
masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif'
keep_only_tags = [
dict(name='h1'),dict(name='h2', attrs={'class':'h2'}),
dict(attrs={'class':['img-cnt figure']}),
dict(attrs={'class':['art-img']}),
dict(name='div', attrs={'class':'art-lft'}),
dict(name='p')
]
remove_tags = [
dict(name = 'div',attrs={'id' : ['comments-news','formSubmission']}),
dict(name='div', attrs={'class':[ 'news m12 clrd clr-b p5t shareBtm', 'commentForm', 'metroCommentInnerWrap',
'art-rgt','pluck-app pluck-comm','news m12 clrd clr-l p5t', 'flt-r','username','clrd' ]}),
dict(attrs={'class':['username', 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime','addYourComment','displayName']})
,dict(name='div', attrs={'class' : 'clrd art-fd fd-gr1-b'})
]
feeds = [
(u'News', u'http://www.metro.co.uk/rss/news/'), (u'Money', u'http://www.metro.co.uk/rss/money/'), (u'Sport', u'http://www.metro.co.uk/rss/sport/'), (u'Film', u'http://www.metro.co.uk/rss/metrolife/film/'), (u'Music', u'http://www.metro.co.uk/rss/metrolife/music/'), (u'TV', u'http://www.metro.co.uk/rss/tv/'), (u'Showbiz', u'http://www.metro.co.uk/rss/showbiz/'), (u'Weird News', u'http://www.metro.co.uk/rss/weird/'), (u'Travel', u'http://www.metro.co.uk/rss/travel/'), (u'Lifestyle', u'http://www.metro.co.uk/rss/lifestyle/'), (u'Books', u'http://www.metro.co.uk/rss/lifestyle/books/'), (u'Food', u'http://www.metro.co.uk/rss/lifestyle/restaurants/')]
extra_css = '''
body {font: sans-serif medium;}'
h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
span{ font-size:9.5px; font-weight:bold;font-style:italic}
p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
'''
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
'''

View File

@ -9,8 +9,9 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
class recipeMagic(BasicNewsRecipe):
title = 'National Geographic PL'
__author__ = 'Marcin Urban 2011'
__modified_by__ = 'fenuks'
description = 'legenda wśród magazynów z historią sięgającą 120 lat'
cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg'
#cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
@ -42,11 +43,43 @@ class recipeMagic(BasicNewsRecipe):
]
remove_attributes = ['width','height']
feeds=[]
feeds = [
('National Geographic PL', 'http://www.national-geographic.pl/rss/'),
]
def find_articles(self, url):
articles = []
soup=self.index_to_soup(url)
tag=soup.find(attrs={'class':'arl'})
art=tag.ul.findAll('li')
for i in art:
title=i.a['title']
url=i.a['href']
#date=soup.find(id='footer').ul.li.string[41:-1]
desc=i.div.p.string
articles.append({'title' : title,
'url' : url,
'date' : '',
'description' : desc
})
return articles
def parse_index(self):
feeds = []
feeds.append((u"Aktualności", self.find_articles('http://www.national-geographic.pl/aktualnosci/')))
feeds.append((u"Artykuły", self.find_articles('http://www.national-geographic.pl/artykuly/')))
return feeds
def print_version(self, url):
return url.replace('artykuly0Cpokaz', 'drukuj-artykul')
if 'artykuly' in url:
return url.replace('artykuly/pokaz', 'drukuj-artykul')
elif 'aktualnosci' in url:
return url.replace('aktualnosci/pokaz', 'drukuj-artykul')
else:
return url
def get_cover_url(self):
soup = self.index_to_soup('http://www.national-geographic.pl/biezace-wydania/')
tag=soup.find(attrs={'class':'txt jus'})
self.cover_url=tag.img['src']
return self.cover_url

View File

@ -0,0 +1,16 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1335362999(BasicNewsRecipe):
title = u'Non leggerlo'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = False
keep_only_tags = [
dict(name='div', attrs={'class':'post hentry'})
]
feeds = [(u'Non leggerlo', u'http://nonleggerlo.blogspot.com/feeds/posts/default')]
description = 'An Italian satirical blog'
language = 'it'
__author__ = 'faber1971'
__version__ = 'v1.0'
__date__ = '24, April 2012'

View File

@ -81,5 +81,7 @@ class Nowa_Fantastyka(BasicNewsRecipe):
title=soup.find(attrs={'class':'tytul'})
if title:
title['style']='font-size: 20px; font-weight: bold;'
self.log.warn(soup)
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.INDEX + a['href']
return soup

View File

@ -1,3 +1,4 @@
import urllib, re
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1279258912(BasicNewsRecipe):
@ -27,12 +28,30 @@ class AdvancedUserRecipe1279258912(BasicNewsRecipe):
encoding = 'utf-8'
conversion_options = {'linearize_tables':True}
masthead_url = 'http://www.orlandosentinel.com/media/graphic/2009-07/46844851.gif'
keep_only_tags = [
dict(name='div', attrs={'class':'story'})
]
remove_tags = [
dict(name='div', attrs={'class':['articlerail','tools','comment-group','clearfix']}),
]
remove_tags_after = [
dict(name='p', attrs={'class':'copyright'}),
]
auto_cleanup = True
def get_article_url(self, article):
ans = None
try:
s = article.summary
ans = urllib.unquote(
re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
except:
pass
if ans is None:
link = article.get('feedburner_origlink', None)
if link and link.split('/')[-1]=="story01.htm":
link=link.split('/')[-2]
encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
'0D': '?', '0E': '-', '0N': '.com', '0L': 'http:',
'0S':'//'}
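# Hypothetical example of the decoding this table performs (not taken from
# a real feed): the component '0L0Swww0Borlandosentinel0Ncom0Cnews'
# would decode to 'http://www.orlandosentinel.com/news'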
for k, v in encoding.iteritems():
link = link.replace(k, v)
ans = link
elif link:
ans = link
if ans is not None:
return ans.replace('?track=rss', '')

View File

@ -7,6 +7,7 @@ class PC_Arena(BasicNewsRecipe):
description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.'
category = 'IT'
language = 'pl'
index='http://pcarena.pl'
masthead_url='http://pcarena.pl/pcarena/img/logo.png'
cover_url= 'http://pcarena.pl/pcarena/img/logo.png'
no_stylesheets = True
@ -22,4 +23,10 @@ class PC_Arena(BasicNewsRecipe):
if 'http' not in url:
return 'http://pcarena.pl' + url
else:
return url
return url
def preprocess_html(self, soup):
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup

View File

@ -1,5 +1,5 @@
"""
readitlaterlist.com
Pocket Calibre Recipe v1.0
"""
__license__ = 'GPL v3'
__copyright__ = '''
@ -12,22 +12,23 @@ from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class Readitlater(BasicNewsRecipe):
title = 'ReadItLater'
class Pocket(BasicNewsRecipe):
title = 'Pocket'
__author__ = 'Darko Miletic, Przemyslaw Kryger, Keith Callenberg, tBunnyMan'
description = '''Personalized news feeds. Go to readitlaterlist.com to setup \
up your news. This version displays pages of articles from \
description = '''Personalized news feeds. Go to getpocket.com to set up \
your news. This version displays pages of articles from \
oldest to newest, with max & minimum counts, and marks articles \
read after downloading.'''
publisher = 'readitlaterlist.com'
publisher = 'getpocket.com'
category = 'news, custom'
oldest_article = 7
max_articles_per_feed = 50
minimum_articles = 1
minimum_articles = 10
mark_as_read_after_dl = True
no_stylesheets = True
use_embedded_content = False
needs_subscription = True
INDEX = u'http://readitlaterlist.com'
INDEX = u'http://getpocket.com'
LOGIN = INDEX + u'/l'
readList = []
@ -100,9 +101,31 @@ class Readitlater(BasicNewsRecipe):
br = self.get_browser()
for link in markList:
url = self.INDEX + link
print 'Marking read: ', url
response = br.open(url)
response
print response.info()
def cleanup(self):
self.mark_as_read(self.readList)
if self.mark_as_read_after_dl:
self.mark_as_read(self.readList)
else:
pass
def default_cover(self, cover_file):
'''
Create a generic cover for recipes that don't have a cover
This override adds time to the cover
'''
try:
from calibre.ebooks import calibre_cover
title = self.title if isinstance(self.title, unicode) else \
self.title.decode('utf-8', 'replace')
date = strftime(self.timefmt)
time = strftime('[%I:%M %p]')
img_data = calibre_cover(title, date, time)
cover_file.write(img_data)
cover_file.flush()
except:
self.log.exception('Failed to generate default cover')
return False
return True

View File

@ -0,0 +1,59 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe1335532466(BasicNewsRecipe):
title = u'Richmond Times-Dispatch'
description = 'News from Richmond, Virginia, USA'
__author__ = 'jde'
cover_url = 'http://static2.dukecms.com/va_tn/timesdispatch_com/site-media/img/icons/logo252x97.png'
language = 'en'
encoding = 'utf8'
oldest_article = 1 #days
max_articles_per_feed = 25
needs_subscription = False
remove_javascript = True
recursions = 0
use_embedded_content = False
no_stylesheets = True
auto_cleanup = True
feeds = [
('News',
'http://www2.timesdispatch.com/list/feed/rss/news-archive'),
('Breaking News',
'http://www2.timesdispatch.com/list/feed/rss/breaking-news'),
('National News',
'http://www2.timesdispatch.com/list/feed/rss/national-news'),
('Local News',
'http://www2.timesdispatch.com/list/feed/rss/local-news'),
('Business',
'http://www2.timesdispatch.com/list/feed/rss/business'),
('Local Business',
'http://www2.timesdispatch.com/list/feed/rss/local-business'),
('Politics',
'http://www2.timesdispatch.com/list/feed/rss/politics'),
('Virginia Politics',
'http://www2.timesdispatch.com/list/feed/rss/virginia-politics'),
('Editorials',
'http://www2.timesdispatch.com/list/feed/rss/editorial-desk'),
('Columnists and Blogs',
'http://www2.timesdispatch.com/list/feed/rss/news-columnists-blogs'),
('Opinion Columnists',
'http://www2.timesdispatch.com/list/feed/rss/opinion-editorial-columnists'),
('Letters to the Editor',
'http://www2.timesdispatch.com/list/feed/rss/opinion-letters'),
('Traffic',
'http://www2.timesdispatch.com/list/feed/rss/traffic'),
('Sports',
'http://www2.timesdispatch.com/list/feed/rss/sports2'),
('Entertainment/Life',
'http://www2.timesdispatch.com/list/feed/rss/entertainment'),
('Movies',
'http://www2.timesdispatch.com/list/feed/rss/movies'),
('Music',
'http://www2.timesdispatch.com/list/feed/rss/music'),
('Dining & Food',
'http://www2.timesdispatch.com/list/feed/rss/dining'),
]

recipes/sol_haber.recipe Normal file
View File

@ -0,0 +1,141 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Onur Gungor onurgu@gmail.com'
__docformat__ = 'restructuredtext en'
'''
www.sol.org.tr
'''
import datetime
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class SolHaberRecipe(BasicNewsRecipe):
title = u'soL Haber'
oldest_article = 7
max_articles_per_feed = 100
language = 'tr'
__author__ = 'Onur Güngör'
description = 'Hayata soL\'dan bakın..'
publisher = 'soL Haber'
tags = 'news, haberler, siyaset, türkiye, turkey, politics'
conversion_options = {
'comment' : description
, 'tags' : tags
, 'publisher' : publisher
, 'language' : language
}
category_dict = { 'sonuncu-kavga':'Sonuncu Kavga',
'devlet-ve-siyaset':'Devlet ve Siyaset',
'ekonomi':'Ekonomi',
'enternasyonal-gundem':'Enternasyonel Gündem',
'kent-gundemleri':'Kent Gündemleri',
'kultur-sanat':'Kültür Sanat',
'dunyadan':'Dünyadan',
'serbest-kursu':'Serbest Kürsü',
'medya':'Medya',
'liseliler':'Liseliler',
'yazarlar':'Köşe Yazıları'}
end_date = datetime.date.today().isoformat()
start_date = (datetime.date.today()-datetime.timedelta(days=1)).isoformat()
section_tuples = [['Köşe Yazıları', 'http://haber.sol.org.tr/arsiv?icerik=kose_yazisi&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
['Haberler', 'http://haber.sol.org.tr/arsiv?icerik=haber&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
['soL postal', 'http://haber.sol.org.tr/arsiv?icerik=postal&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
['Bizim Amerika', 'http://haber.sol.org.tr/arsiv?icerik=bizim_amerika&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)]]
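# After the % substitution each archive URL looks like (dates illustrative):
# http://haber.sol.org.tr/arsiv?icerik=haber&tarih%5Bmin%5D%5Bdate%5D=2012-04-26&tarih%5Bmax%5D%5Bdate%5D=2012-04-27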
# Disable stylesheets from site.
no_stylesheets = True
cover_margins = (20, 20, '#ffffff')
storybody_reg_exp = '^\s*(haber|kose)\s*$'
comments_reg_exp = '^\s*makale-elestiri\s*$'
remove_tags = [dict(name='div', attrs={'class':re.compile(comments_reg_exp, re.IGNORECASE)})]
keep_only_tags = [dict(name='div', attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)})]
def get_masthead_title(self):
return self.title + "(" + self.end_date + ")"
def parse_index(self):
result = []
articles_dict = dict()
author_regexp = re.compile('^http://.*?/yazarlar/(.*?)/.*$')
category_regexp = re.compile('^http://.*?/(.+?)/.*$')
for section_tuple in self.section_tuples:
section_title = section_tuple[0]
section_index_url = section_tuple[1]
self.log('Bölüm:', section_title, 'URL:', section_index_url)
soup = self.index_to_soup(section_index_url)
logo = soup.find('div', id='logo').find('img', src=True)
if logo is not None:
self.cover_url = logo['src']
if self.cover_url.startswith('/'):
self.cover_url = 'http://haber.sol.org.tr'+self.cover_url
view_content = soup.find('div', id='ana-icerik').find('div', attrs={'class':'view-content'})
if view_content is None:
break
rows = view_content.find('tbody').findAll('tr')
self.log('Row sayısı', len(rows))
for row in rows:
cells = row.findAll('td')
a = cells[1].find('a', href=True)
url = a['href']
title = self.tag_to_string(a)
if url.startswith('/'):
url = 'http://haber.sol.org.tr'+url
category = section_title
category_match_result = category_regexp.match(url)
if category_match_result:
category = category_match_result.group(1)
date = self.tag_to_string(cells[2])
author = 'soL haber'
author_match_result = author_regexp.match(url)
if author_match_result:
author = author_match_result.group(1)
self.log('\tFound article:', title, 'at', url, 'published at ', date, 'by', author)
article = {'title':title, 'url':url, 'description':None, 'date':date, 'author':author}
if category in articles_dict:
articles_dict[category].append(article)
else:
articles_dict[category] = [article]
for category in articles_dict.keys():
if category in self.category_dict:
result.append((self.category_dict[category], articles_dict[category]))
else:
result.append((category, articles_dict[category]))
return result

View File

@ -0,0 +1,25 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Swiat_Obrazu(BasicNewsRecipe):
title = u'Swiat Obrazu'
__author__ = 'fenuks'
description = u'Internetowy Dziennik o Fotografii i Wideo www.SwiatObrazu.pl to źródło informacji o technice fotografii i wideo, o sprzęcie najbardziej znanych i uznanych firm: Canon, Nikon, Sony, Hasselblad i wielu innych. Znajdziecie tu programy do obróbki zdjęć, forum foto i forum wideo i galerie zdjęć. Codziennie najświeższe informacje: aktualności, testy, poradniki, wywiady, felietony. Swiatobrazu.pl stale organizuje konkursy oraz warsztaty fotograficzne i wideo.'
category = 'photography'
masthead_url = 'http://www.swiatobrazu.pl/img/logo.jpg'
cover_url = 'http://www.swiatobrazu.pl/img/logo.jpg'
language = 'pl'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
remove_javascript= True
use_embedded_content = False
feeds = [(u'Wszystko', u'http://www.swiatobrazu.pl/rss')]
def print_version(self, url):
return url + ',drukuj'
def image_url_processor(self, baseurl, url):
if 'http://' not in url and 'https://' not in url:
return 'http://www.swiatobrazu.pl' + url[5:]
else:
return url

View File

@ -34,7 +34,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
no_javascript = True
remove_empty_feeds = True
encoding = 'utf-8'
remove_tags = [{'class':'hcf-header'}, {'class':'hcf-atlas'}, {'class':'hcf-date hcf-separate'}]
remove_tags = [{'class':'hcf-header'}, {'class':'hcf-atlas'}, {'class':'hcf-colon'}, {'class':'hcf-date hcf-separate'}]
def print_version(self, url):
url = url.split('/')
@ -51,6 +51,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
return ''.join(div.findAll(text=True, recursive=False)).strip() if div is not None else None
articles = {}
links = set()
key = None
ans = []
maincol = soup.find('div', attrs={'class':re.compile('hcf-main-col')})
@ -59,7 +60,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
if div['class'] == 'hcf-header':
try:
key = string.capwords(feed_title(div.em.a))
key = string.capwords(feed_title(div.em))
articles[key] = []
ans.append(key)
except:
@ -70,6 +71,12 @@ class TagesspiegelRSS(BasicNewsRecipe):
if not a:
continue
url = 'http://www.tagesspiegel.de' + a['href']
# check for duplicates
if url in links:
continue
links.add(url)
title = self.tag_to_string(a, use_alt=True).strip()
description = ''
pubdate = strftime('%a, %d %b')

View File

@ -34,4 +34,12 @@ class tanuki(BasicNewsRecipe):
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
if 'tanuki-anime' in soup.title.string.lower():
a['href']='http://anime.tanuki.pl' + a['href']
elif 'tanuki-manga' in soup.title.string.lower():
a['href']='http://manga.tanuki.pl' + a['href']
elif 'tanuki-czytelnia' in soup.title.string.lower():
a['href']='http://czytelnia.tanuki.pl' + a['href']
return soup

recipes/telam.recipe Normal file
View File

@ -0,0 +1,62 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
www.telam.com.ar
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Telam(BasicNewsRecipe):
title = 'Telam'
__author__ = 'Darko Miletic'
description = 'AGENCIA DE NOTICIAS DE LA REPUBLICA ARGENTINA'
publisher = 'Telam S.E.'
category = 'news, politics, Argentina'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'es_AR'
remove_empty_feeds = True
publication_type = 'newsportal'
masthead_url = 'http://www.telam.com.ar/front/imagenes/encabezado/logotelam.jpg'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif }
img{margin-bottom: 0.4em; display:block}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_tags = [dict(name=['meta','link'])]
remove_tags_before = dict(attrs={'class':'nota_fecha'})
remove_tags_after = dict(attrs={'class':'nota_completa'})
remove_attributes = ['lang']
feeds = [
(u'Ultimas noticias', u'http://www.telam.com.ar/xml/rss/' )
,(u'Politica' , u'http://www.telam.com.ar/xml/rss/1')
,(u'Economia' , u'http://www.telam.com.ar/xml/rss/2')
,(u'Sociedad' , u'http://www.telam.com.ar/xml/rss/3')
,(u'Policiales' , u'http://www.telam.com.ar/xml/rss/4')
,(u'Internacionales' , u'http://www.telam.com.ar/xml/rss/6')
,(u'Espectaculos' , u'http://www.telam.com.ar/xml/rss/7')
,(u'Cultura' , u'http://www.telam.com.ar/xml/rss/8')
,(u'Deportes' , u'http://www.telam.com.ar/xml/rss/9')
,(u'Telam Investiga' , u'http://www.telam.com.ar/xml/rss/5')
]
def print_version(self, url):
artid = url.rpartition('/')[2]
return 'http://www.telam.com.ar/?codProg=imprimir-nota&id=' + artid
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup

View File

@ -1,9 +1,8 @@
import re
import re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
title = u'The Sun UK'
cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
description = 'A Recipe for The Sun tabloid UK'
__author__ = 'Dave Asbury'
@ -24,37 +23,69 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
no_stylesheets = True
extra_css = '''
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
'''
preprocess_regexps = [
(re.compile(r'<div class="foot-copyright".*?</div>', re.IGNORECASE | re.DOTALL), lambda match: '')]
keep_only_tags = [
dict(name='h1'),dict(name='h2',attrs={'class' : 'medium centered'}),
dict(name='div',attrs={'class' : 'text-center'}),
dict(name='div',attrs={'id' : 'bodyText'})
# dict(name='p')
]
remove_tags=[
#dict(name='head'),
dict(attrs={'class' : ['mystery-meat-link','ltbx-container','ltbx-var ltbx-hbxpn','ltbx-var ltbx-nav-loop','ltbx-var ltbx-url']}),
dict(name='div',attrs={'class' : 'cf'}),
dict(attrs={'title' : 'download flash'}),
dict(attrs={'style' : 'padding: 5px'})
]
feeds = [
#(u'News', u'http://www.thesun.co.uk/sol/homepage/news/rss'),
(u'News','http://feed43.com/2517447382644748.xml'),
(u'Sport', u'http://feed43.com/4283846255668687.xml'),
(u'Bizarre', u'http://feed43.com/0233840304242011.xml'),
(u'Film',u'http://feed43.com/1307545221226200.xml'),
(u'Music',u'http://feed43.com/1701513435064132.xml'),
(u'Sun Woman',u'http://feed43.com/0022626854226453.xml'),
]
def get_cover_url(self):
soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
# look for the block containing the sun button and url
cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'})
#cov = soup.find(attrs={'id' : 'large'})
cov2 = str(cov)
cov2='http://www.politicshome.com'+cov2[9:-133]
#cov2 now contains url of the page containing pic
soup = self.index_to_soup(cov2)
cov = soup.find(attrs={'id' : 'large'})
cov2 = str(cov)
cov2=cov2[27:-18]
#cov2 now is pic url, now go back to original function
br = mechanize.Browser()
br.set_handle_redirect(False)
try:
br.open_novisit(cov2)
cover_url = cov2
except:
cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
#cover_url = cov2
#cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
return cover_url

View File

@ -11,6 +11,8 @@ class TPM_uk(BasicNewsRecipe):
__author__ = 'Darko Miletic'
description = 'Title says it all'
publisher = "The Philosophers' Magazine"
recipe_disabled = ('This recipe has been disabled as the website has'
' started providing articles only in PDF form')
category = 'philosophy, news'
oldest_article = 25
max_articles_per_feed = 200

View File

@ -2,65 +2,50 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
''' Changelog
2012-04-27 DrMerry:
Added cover picture
removed some extra tags
'''
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Tweakers(BasicNewsRecipe):
title = u'Tweakers.net - with Reactions'
__author__ = 'Roedi06'
title = u'Tweakers.net'
__author__ = 'Kovid Goyal'
language = 'nl'
oldest_article = 7
max_articles_per_feed = 100
cover_url = 'http://img51.imageshack.us/img51/7470/tweakersnetebook.gif'
oldest_article = 4
max_articles_per_feed = 40
cover_url = 'http://tweakers.net/ext/launch/g/logo.gif'
keep_only_tags = [dict(name='div', attrs={'class':'columnwrapper news'}),
{'id':'reacties'},
]
keep_only_tags = [dict(name='div', attrs={'class':'columnwrapper news'})]
remove_tags = [dict(name='div', attrs={'id' : ['utracker']}),
{'id' : ['channelNav']},
{'id' : ['contentArea']},
{'class' : ['breadCrumb']},
{'class' : ['nextPrevious ellipsis']},
{'class' : ['advertorial']},
{'class' : ['sidebar']},
{'class' : ['filterBox']},
{'id' : ['toggleButtonTxt']},
{'id' : ['socialButtons']},
{'class' : ['button']},
{'class' : ['textadTop']},
{'class' : ['commentLink']},
{'title' : ['Reageer op deze reactie']},
{'class' : ['pageIndex']},
{'class' : ['reactieHeader collapsed']},
remove_tags = [dict(name='div', attrs={'class':'reacties'}),
{'id' : ['utracker','socialButtons','b_ac']},
{'class' : ['sidebar','advertorial']},
{'class' : re.compile('nextPrevious')},
]
no_stylesheets=True
filter_regexps = [r'ads\.doubleclick\.net',r'ad\.doubleclick\.net']
preprocess_regexps = [
(re.compile(r'<hr*?>', re.IGNORECASE | re.DOTALL), lambda match : ''),
(re.compile(r'<p>', re.IGNORECASE | re.DOTALL), lambda match : ''),
(re.compile(r'</p>', re.IGNORECASE | re.DOTALL), lambda match : ''),
(re.compile(r'<a.*?>'), lambda h1: '<b><u>'),
(re.compile(r'</a>'), lambda h2: '</u></b>'),
(re.compile(r'<span class="new">', re.IGNORECASE | re.DOTALL), lambda match : ''),
(re.compile(r'</span>', re.IGNORECASE | re.DOTALL), lambda match : ''),
(re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_0'), lambda match : ' - moderated 0<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_0'),
(re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_1'), lambda match : ' - moderated +1<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_1'),
(re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_2'), lambda match : ' - moderated +2<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_2'),
(re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_3'), lambda match : ' - moderated +3<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_3'),
(re.compile(r'<div class="moderation">.*?</div>'), lambda h1: ''),
]
feeds = [(u'Tweakers.net', u'http://tweakers.net/feeds/nieuws.xml')]
extra_css = '.reactieHeader { color: #333333; font-size: 6px; border-bottom:solid 2px #333333; border-top:solid 1px #333333; } \
.reactieContent { font-family:"Times New Roman",Georgia,Serif; color: #000000; font-size: 8px; } \
.quote { font-family:"Times New Roman",Georgia,Serif; padding-left:2px; border-left:solid 3px #666666; color: #666666; }'
feeds = [(u'Tweakers.net', u'http://feeds.feedburner.com/tweakers/nieuws')]
def print_version(self, url):
return url + '?max=200'
def preprocess_html(self, soup):
for a in soup.findAll('a', href=True, rel=True):
if a['rel'].startswith('imageview'):
a['src'] = a['href']
del a['href']
a.name = 'img'
for x in a.findAll(True):
x.extract()
return soup
def postprocess_html(self, soup, first):
for base in soup.findAll('base'):
base.extract()
return soup

recipes/vignette.recipe Normal file
View File

@ -0,0 +1,19 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1334935485(BasicNewsRecipe):
title = u'Vignette'
oldest_article = 15
max_articles_per_feed = 100
auto_cleanup = False
keep_only_tags = [
dict(name='div', attrs={'class':['HomeFirstNewsfoto', 'photo']}),
dict(name='img', attrs={'class':'altan-big'})
]
masthead_url = 'http://vauro.globalist.it/vauroglobalistit/Img/vauro-logo-beta.gif'
feeds = [(u'Altan', u'http://feed43.com/3556647724071522.xml'), (u'Ellekappa', u'http://ellekappa.tumblr.com/rss'), (u'Vauro', u'http://feeds.feedburner.com/vauro')]
description = 'Ellekappa, Altan, Vauro - the best Italian satirical cartoons'
language = 'it'
__author__ = 'faber1971'
__version__ = 'v1.0'
__date__ = '24, April 2012'

View File

@ -8,6 +8,7 @@ class webhosting_pl(BasicNewsRecipe):
cover_url='http://webhosting.pl/images/logo.png'
masthead_url='http://webhosting.pl/images/logo.png'
oldest_article = 7
index='http://webhosting.pl'
max_articles_per_feed = 100
no_stylesheets = True
remove_empty_feeds = True
@ -36,4 +37,10 @@ class webhosting_pl(BasicNewsRecipe):
(u'Marketing', u'http://webhosting.pl/feed/rss/n/11535')]
def print_version(self, url):
return url.replace('webhosting.pl', 'webhosting.pl/print')
return url.replace('webhosting.pl', 'webhosting.pl/print')
def preprocess_html(self, soup):
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup

View File

@ -21,7 +21,7 @@ class XkcdCom(BasicNewsRecipe):
use_embedded_content = False
oldest_article = 60
keep_only_tags = [dict(id='middleContent')]
keep_only_tags = [dict(id='middleContainer')]
remove_tags = [dict(name='ul'), dict(name='h3'), dict(name='br')]
no_stylesheets = True
# turn image bubblehelp into a paragraph

View File

@ -377,7 +377,7 @@
<xsl:apply-templates/><br/>
</xsl:template>
<!-- image -->
<xsl:template match="fb:image">
<xsl:template match="fb:body/fb:image|fb:section/fb:image">
<div align="center">
<xsl:element name="img">
<xsl:attribute name="border">1</xsl:attribute>
@ -395,4 +395,20 @@
</xsl:element>
</div>
</xsl:template>
<xsl:template match="fb:image">
<xsl:element name="img">
<xsl:choose>
<xsl:when test="starts-with(@xlink:href,'#')">
<xsl:attribute name="src"><xsl:value-of select="substring-after(@xlink:href,'#')"/></xsl:attribute>
</xsl:when>
<xsl:otherwise>
<xsl:attribute name="src"><xsl:value-of select="@xlink:href"/></xsl:attribute>
</xsl:otherwise>
</xsl:choose>
<xsl:if test="@title">
<xsl:attribute name="title"><xsl:value-of select="@title"/></xsl:attribute>
</xsl:if>
</xsl:element>
</xsl:template>
</xsl:stylesheet>

View File

@ -26,7 +26,7 @@ def login_to_google(username, password):
br.form['Email'] = username
br.form['Passwd'] = password
raw = br.submit().read()
if re.search(br'<title>.*?Account Settings</title>', raw) is None:
if re.search(br'(?i)<title>.*?Account Settings</title>', raw) is None:
x = re.search(br'(?is)<title>.*?</title>', raw)
if x is not None:
print ('Title of post login page: %s'%x.group())

View File

@ -12,14 +12,14 @@ msgstr ""
"Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
"devel@lists.alioth.debian.org>\n"
"POT-Creation-Date: 2011-11-25 14:01+0000\n"
"PO-Revision-Date: 2011-12-14 19:48+0000\n"
"Last-Translator: Ferran Rius <frius64@hotmail.com>\n"
"PO-Revision-Date: 2012-04-12 09:56+0000\n"
"Last-Translator: Dídac Rios <didac@niorcs.com>\n"
"Language-Team: Catalan <linux@softcatala.org>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Launchpad-Export-Date: 2011-12-15 05:18+0000\n"
"X-Generator: Launchpad (build 14487)\n"
"X-Launchpad-Export-Date: 2012-04-13 05:26+0000\n"
"X-Generator: Launchpad (build 15070)\n"
"Language: ca\n"
#. name for aaa
@ -9584,31 +9584,31 @@ msgstr ""
#. name for hoi
msgid "Holikachuk"
msgstr ""
msgstr "Holikachuk"
#. name for hoj
msgid "Hadothi"
msgstr ""
msgstr "Hadothi"
#. name for hol
msgid "Holu"
msgstr ""
msgstr "Holu"
#. name for hom
msgid "Homa"
msgstr ""
msgstr "Homa"
#. name for hoo
msgid "Holoholo"
msgstr ""
msgstr "Holoholo"
#. name for hop
msgid "Hopi"
msgstr ""
msgstr "Hopi"
#. name for hor
msgid "Horo"
msgstr ""
msgstr "Horo"
#. name for hos
msgid "Ho Chi Minh City Sign Language"
@ -9616,15 +9616,15 @@ msgstr "Llenguatge de signes de la ciutat de Ho Chi Minh"
#. name for hot
msgid "Hote"
msgstr ""
msgstr "Hote"
#. name for hov
msgid "Hovongan"
msgstr ""
msgstr "Hovongan"
#. name for how
msgid "Honi"
msgstr ""
msgstr "Honi"
#. name for hoy
msgid "Holiya"
@ -9636,7 +9636,7 @@ msgstr ""
#. name for hpo
msgid "Hpon"
msgstr ""
msgstr "Hpon"
#. name for hps
msgid "Hawai'i Pidgin Sign Language"
@ -9644,35 +9644,35 @@ msgstr "Hawaià Pidgin; llenguatge de signes"
#. name for hra
msgid "Hrangkhol"
msgstr ""
msgstr "Hrangkhol"
#. name for hre
msgid "Hre"
msgstr ""
msgstr "Hre"
#. name for hrk
msgid "Haruku"
msgstr ""
msgstr "Haruku"
#. name for hrm
msgid "Miao; Horned"
msgstr ""
msgstr "Miao; Horned"
#. name for hro
msgid "Haroi"
msgstr ""
msgstr "Haroi"
#. name for hrr
msgid "Horuru"
msgstr ""
msgstr "Horuru"
#. name for hrt
msgid "Hértevin"
msgstr ""
msgstr "Hértevin"
#. name for hru
msgid "Hruso"
msgstr ""
msgstr "Hruso"
#. name for hrv
msgid "Croatian"
@ -9680,11 +9680,11 @@ msgstr "Croat"
#. name for hrx
msgid "Hunsrik"
msgstr ""
msgstr "Hunsrik"
#. name for hrz
msgid "Harzani"
msgstr ""
msgstr "Harzani"
#. name for hsb
msgid "Sorbian; Upper"
@ -9704,31 +9704,31 @@ msgstr "Xinès; Xiang"
#. name for hss
msgid "Harsusi"
msgstr ""
msgstr "Harsusi"
#. name for hti
msgid "Hoti"
msgstr ""
msgstr "Hoti"
#. name for hto
msgid "Huitoto; Minica"
msgstr ""
msgstr "Huitoto; Minica"
#. name for hts
msgid "Hadza"
msgstr ""
msgstr "Hadza"
#. name for htu
msgid "Hitu"
msgstr ""
msgstr "Hitu"
#. name for htx
msgid "Hittite; Middle"
msgstr ""
msgstr "Hittite; Middle"
#. name for hub
msgid "Huambisa"
msgstr ""
msgstr "Huambisa"
#. name for huc
msgid "=/Hua"
@ -9736,27 +9736,27 @@ msgstr ""
#. name for hud
msgid "Huaulu"
msgstr ""
msgstr "Huaulu"
#. name for hue
msgid "Huave; San Francisco Del Mar"
msgstr ""
msgstr "Huave; San Francisco Del Mar"
#. name for huf
msgid "Humene"
msgstr ""
msgstr "Humene"
#. name for hug
msgid "Huachipaeri"
msgstr ""
msgstr "Huachipaeri"
#. name for huh
msgid "Huilliche"
msgstr ""
msgstr "Huilliche"
#. name for hui
msgid "Huli"
msgstr ""
msgstr "Huli"
#. name for huj
msgid "Miao; Northern Guiyang"
@ -9764,15 +9764,15 @@ msgstr "Miao; Guiyang septentrional"
#. name for huk
msgid "Hulung"
msgstr ""
msgstr "Hulung"
#. name for hul
msgid "Hula"
msgstr ""
msgstr "Hula"
#. name for hum
msgid "Hungana"
msgstr ""
msgstr "Hungana"
#. name for hun
msgid "Hungarian"
@ -9780,43 +9780,43 @@ msgstr "Hongarès"
#. name for huo
msgid "Hu"
msgstr ""
msgstr "Hu"
#. name for hup
msgid "Hupa"
msgstr ""
msgstr "Hupa"
#. name for huq
msgid "Tsat"
msgstr ""
msgstr "Tsat"
#. name for hur
msgid "Halkomelem"
msgstr ""
msgstr "Halkomelem"
#. name for hus
msgid "Huastec"
msgstr ""
msgstr "Huastec"
#. name for hut
msgid "Humla"
msgstr ""
msgstr "Humla"
#. name for huu
msgid "Huitoto; Murui"
msgstr ""
msgstr "Huitoto; Murui"
#. name for huv
msgid "Huave; San Mateo Del Mar"
msgstr ""
msgstr "Huave; San Mateo Del Mar"
#. name for huw
msgid "Hukumina"
msgstr ""
msgstr "Hukumina"
#. name for hux
msgid "Huitoto; Nüpode"
msgstr ""
msgstr "Huitoto; Nüpode"
#. name for huy
msgid "Hulaulá"

View File

@ -18,27 +18,27 @@ msgstr ""
"Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
"devel@lists.alioth.debian.org>\n"
"POT-Creation-Date: 2011-11-25 14:01+0000\n"
"PO-Revision-Date: 2012-03-05 19:08+0000\n"
"Last-Translator: Dennis Baudys <Unknown>\n"
"PO-Revision-Date: 2012-04-21 14:42+0000\n"
"Last-Translator: SimonFS <simonschuette@arcor.de>\n"
"Language-Team: German <debian-l10n-german@lists.debian.org>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Launchpad-Export-Date: 2012-03-06 04:47+0000\n"
"X-Generator: Launchpad (build 14900)\n"
"X-Launchpad-Export-Date: 2012-04-22 04:43+0000\n"
"X-Generator: Launchpad (build 15120)\n"
"Language: de\n"
#. name for aaa
msgid "Ghotuo"
msgstr ""
msgstr "Ghotuo (Nigeria)"
#. name for aab
msgid "Alumu-Tesu"
msgstr "Alumu-Tesu"
msgstr "Alumu-Tesu (Nigeria)"
#. name for aac
msgid "Ari"
msgstr "Ari"
msgstr "Ari (Papua-Neuguinea)"
#. name for aad
msgid "Amal"
@ -66,7 +66,7 @@ msgstr "Arifama-Miniafia"
#. name for aak
msgid "Ankave"
msgstr "Ankave"
msgstr "Ankave (Papua-Neuguinea)"
#. name for aal
msgid "Afade"
@ -110,7 +110,7 @@ msgstr ""
#. name for aaw
msgid "Solong"
msgstr ""
msgstr "Solong"
#. name for aax
msgid "Mandobo Atas"
@ -30860,7 +30860,7 @@ msgstr ""
#. name for zxx
msgid "No linguistic content"
msgstr ""
msgstr "Kein linguistischer Inhalt"
#. name for zyb
msgid "Zhuang; Yongbei"

File diff suppressed because it is too large

View File

@ -9,14 +9,14 @@ msgstr ""
"Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
"devel@lists.alioth.debian.org>\n"
"POT-Creation-Date: 2011-11-25 14:01+0000\n"
"PO-Revision-Date: 2012-03-06 13:55+0000\n"
"PO-Revision-Date: 2012-04-18 13:08+0000\n"
"Last-Translator: Asier Iturralde Sarasola <Unknown>\n"
"Language-Team: Euskara <itzulpena@comtropos.com>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Launchpad-Export-Date: 2012-03-07 05:12+0000\n"
"X-Generator: Launchpad (build 14907)\n"
"X-Launchpad-Export-Date: 2012-04-19 04:36+0000\n"
"X-Generator: Launchpad (build 15108)\n"
"Language: eu\n"
#. name for aaa
@ -27125,7 +27125,7 @@ msgstr ""
#. name for vie
msgid "Vietnamese"
msgstr "Mahastiak"
msgstr "Vietnamera"
#. name for vif
msgid "Vili"

View File

@ -10,14 +10,14 @@ msgstr ""
"Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
"devel@lists.alioth.debian.org>\n"
"POT-Creation-Date: 2011-11-25 14:01+0000\n"
"PO-Revision-Date: 2011-11-11 00:16+0000\n"
"PO-Revision-Date: 2012-04-22 07:11+0000\n"
"Last-Translator: kulkke <Unknown>\n"
"Language-Team: Turkish <gnome-turk@gnome.org>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Launchpad-Export-Date: 2011-11-26 05:42+0000\n"
"X-Generator: Launchpad (build 14381)\n"
"X-Launchpad-Export-Date: 2012-04-23 04:45+0000\n"
"X-Generator: Launchpad (build 15135)\n"
"Language: tr\n"
#. name for aaa
@ -7371,7 +7371,7 @@ msgstr ""
#. name for est
msgid "Estonian"
msgstr "Estonyaca"
msgstr "Estonca"
#. name for esu
msgid "Yupik; Central"
@ -11131,7 +11131,7 @@ msgstr ""
#. name for kaz
msgid "Kazakh"
msgstr "Kazak Dili"
msgstr "Kazakça"
#. name for kba
msgid "Kalarko"
@ -13767,7 +13767,7 @@ msgstr ""
#. name for lav
msgid "Latvian"
msgstr "Letonyaca"
msgstr "Letonca"
#. name for law
msgid "Lauje"
@ -16031,7 +16031,7 @@ msgstr ""
#. name for mkd
msgid "Macedonian"
msgstr "Makedonyaca"
msgstr "Makedonca"
#. name for mke
msgid "Mawchi"
@ -22227,7 +22227,7 @@ msgstr ""
#. name for ron
msgid "Romanian"
msgstr "Romence"
msgstr "Rumence"
#. name for roo
msgid "Rotokas"

View File

@ -4,7 +4,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
__appname__ = u'calibre'
numeric_version = (0, 8, 46)
numeric_version = (0, 8, 49)
__version__ = u'.'.join(map(unicode, numeric_version))
__author__ = u"Kovid Goyal <kovid@kovidgoyal.net>"

View File

@ -259,7 +259,7 @@ class LRXMetadataReader(MetadataReaderPlugin):
class MOBIMetadataReader(MetadataReaderPlugin):
name = 'Read MOBI metadata'
file_types = set(['mobi', 'prc', 'azw', 'azw4', 'pobi'])
file_types = set(['mobi', 'prc', 'azw', 'azw3', 'azw4', 'pobi'])
description = _('Read metadata from %s files')%'MOBI'
def get_metadata(self, stream, ftype):

View File

@ -10,6 +10,8 @@ import cStringIO
from calibre.devices.usbms.driver import USBMS
HTC_BCDS = [0x100, 0x0222, 0x0226, 0x227, 0x228]
class ANDROID(USBMS):
name = 'Android driver'
@ -23,23 +25,24 @@ class ANDROID(USBMS):
VENDOR_ID = {
# HTC
0x0bb4 : { 0xc02 : [0x100, 0x0227, 0x0226, 0x222],
0xc01 : [0x100, 0x0227, 0x0226],
0xff9 : [0x0100, 0x0227, 0x0226],
0xc86 : [0x100, 0x0227, 0x0226, 0x222],
0xc87 : [0x0100, 0x0227, 0x0226],
0xc8d : [0x100, 0x0227, 0x0226, 0x222],
0xc91 : [0x0100, 0x0227, 0x0226],
0xc92 : [0x100, 0x0227, 0x0226, 0x222],
0xc97 : [0x100, 0x0227, 0x0226, 0x222],
0xc99 : [0x100, 0x0227, 0x0226, 0x222],
0xca2 : [0x100, 0x0227, 0x0226, 0x222],
0xca3 : [0x100, 0x0227, 0x0226, 0x222],
0xca4 : [0x100, 0x0227, 0x0226, 0x222],
0xca9 : [0x100, 0x0227, 0x0226, 0x222],
0xcac : [0x100, 0x0227, 0x0226, 0x222],
0xccf : [0x100, 0x0227, 0x0226, 0x222],
0x2910 : [0x222],
0x0bb4 : { 0xc02 : HTC_BCDS,
0xc01 : HTC_BCDS,
0xff9 : HTC_BCDS,
0xc86 : HTC_BCDS,
0xc87 : HTC_BCDS,
0xc8d : HTC_BCDS,
0xc91 : HTC_BCDS,
0xc92 : HTC_BCDS,
0xc97 : HTC_BCDS,
0xc99 : HTC_BCDS,
0xca2 : HTC_BCDS,
0xca3 : HTC_BCDS,
0xca4 : HTC_BCDS,
0xca9 : HTC_BCDS,
0xcac : HTC_BCDS,
0xccf : HTC_BCDS,
0x2910 : HTC_BCDS,
0xff9 : [0x9999],
},
# Eken
@ -174,7 +177,7 @@ class ANDROID(USBMS):
'TELECHIP', 'HUAWEI', 'T-MOBILE', 'SEMC', 'LGE', 'NVIDIA',
'GENERIC-', 'ZTE', 'MID', 'QUALCOMM', 'PANDIGIT', 'HYSTON',
'VIZIO', 'GOOGLE', 'FREESCAL', 'KOBO_INC', 'LENOVO', 'ROCKCHIP',
'POCKET', 'ONDA_MID', 'ZENITHIN', 'INGENIC']
'POCKET', 'ONDA_MID', 'ZENITHIN', 'INGENIC', 'PMID701C']
WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE',
'__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897',
'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959_CARD', 'SGH-T959', 'SAMSUNG_ANDROID',
@ -189,7 +192,8 @@ class ANDROID(USBMS):
'UMS', '.K080', 'P990', 'LTE', 'MB853', 'GT-S5660_CARD', 'A107',
'GT-I9003_CARD', 'XT912', 'FILE-CD_GADGET', 'RK29_SDK', 'MB855',
'XT910', 'BOOK_A10', 'USB_2.0_DRIVER', 'I9100T', 'P999DW',
'KTABLET_PC', 'INGENIC', 'GT-I9001_CARD']
'KTABLET_PC', 'INGENIC', 'GT-I9001_CARD', 'USB_2.0_DRIVER',
'GT-S5830L_CARD']
WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
'FILE-STOR_GADGET', 'SGH-T959_CARD', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD',
@ -197,7 +201,7 @@ class ANDROID(USBMS):
'ANDROID_MID', 'P990_SD_CARD', '.K080', 'LTE_CARD', 'MB853',
'A1-07___C0541A4F', 'XT912', 'MB855', 'XT910', 'BOOK_A10_CARD',
'USB_2.0_DRIVER', 'I9100T', 'P999DW_SD_CARD', 'KTABLET_PC',
'FILE-CD_GADGET', 'GT-I9001_CARD']
'FILE-CD_GADGET', 'GT-I9001_CARD', 'USB_2.0_DRIVER']
OSX_MAIN_MEM = 'Android Device Main Memory'

View File

@ -325,6 +325,10 @@ class KINDLE2(KINDLE):
OPT_APNX_ACCURATE = 1
OPT_APNX_CUST_COL = 2
def formats_to_scan_for(self):
ans = USBMS.formats_to_scan_for(self) | {'azw3'}
return ans
def books(self, oncard=None, end_session=True):
bl = USBMS.books(self, oncard=oncard, end_session=end_session)
# Read collections information
@ -423,6 +427,8 @@ class KINDLE_FIRE(KINDLE2):
name = 'Kindle Fire Device Interface'
description = _('Communicate with the Kindle Fire')
gui_name = 'Fire'
FORMATS = list(KINDLE2.FORMATS)
FORMATS.insert(0, 'azw3')
PRODUCT_ID = [0x0006]
BCD = [0x216, 0x100]

View File

@ -298,7 +298,7 @@ class KOBO(USBMS):
changed = False
for i, row in enumerate(cursor):
# self.report_progress((i+1) / float(numrows), _('Getting list of books on device...'))
if row[3].startswith("file:///usr/local/Kobo/help/"):
if not hasattr(row[3], 'startswith') or row[3].startswith("file:///usr/local/Kobo/help/"):
# These are internal to the Kobo device and do not exist
continue
path = self.path_from_contentid(row[3], row[5], row[4], oncard)

View File

@ -86,7 +86,8 @@ class NOOK_COLOR(NOOK):
PRODUCT_ID = [0x002, 0x003, 0x004]
BCD = [0x216]
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['EBOOK_DISK', 'NOOK_TABLET']
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['EBOOK_DISK', 'NOOK_TABLET',
'NOOK_SIMPLETOUCH']
EBOOK_DIR_MAIN = 'My Files'
NEWS_IN_FOLDER = False

View File

@ -307,11 +307,21 @@ class PRST1(USBMS):
# Work-around for Sony Bug (SD Card DB not using right SQLite sequence)
if source_id == 1:
# Update any existing sequence numbers in the table that aren't in the required range
sdcard_sequence_start = '4294967296'
query = 'UPDATE sqlite_sequence SET seq = ? WHERE seq < ?'
t = (sdcard_sequence_start, sdcard_sequence_start,)
cursor.execute(query, t)
# Insert sequence numbers for tables we will be manipulating, if they don't already exist
query = ('INSERT INTO sqlite_sequence (name, seq) '
'SELECT ?, ? '
'WHERE NOT EXISTS (SELECT 1 FROM sqlite_sequence WHERE name = ?)');
cursor.execute(query, ('books',sdcard_sequence_start,'books',))
cursor.execute(query, ('collection',sdcard_sequence_start,'collection',))
cursor.execute(query, ('collections',sdcard_sequence_start,'collections',))
for book in booklist:
# Run through plugboard if needed
if plugboard is not None:

View File

@ -128,6 +128,9 @@ class USBMS(CLI, Device):
elif location_code == 'B':
self._update_driveinfo_file(self._card_b_prefix, location_code, name)
def formats_to_scan_for(self):
return set(self.settings().format_map) | set(self.FORMATS)
def books(self, oncard=None, end_session=True):
from calibre.ebooks.metadata.meta import path_to_ext
@ -166,7 +169,7 @@ class USBMS(CLI, Device):
for idx,b in enumerate(bl):
bl_cache[b.lpath] = idx
all_formats = set(self.settings().format_map) | set(self.FORMATS)
all_formats = self.formats_to_scan_for()
def update_booklist(filename, path, prefix):
changed = False

View File

@ -31,7 +31,7 @@ BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'text', 'ht
'epub', 'fb2', 'djv', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip',
'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'pmlz', 'mbp', 'tan', 'snb',
'xps', 'oxps', 'azw4', 'book', 'zbf', 'pobi', 'docx', 'md',
'textile', 'markdown', 'ibook', 'iba']
'textile', 'markdown', 'ibook', 'iba', 'azw3']
class HTMLRenderer(object):
@ -93,6 +93,20 @@ def extract_calibre_cover(raw, base, log):
if os.path.exists(img):
return open(img, 'rb').read()
# Look for a simple cover, i.e. a body with no text and only one <img> tag
if matches is None:
body = soup.find('body')
if body is not None:
text = u''.join(map(unicode, body.findAll(text=True)))
if text.strip():
# Body has text, abort
return
images = body.findAll('img', src=True)
if 0 < len(images) < 2:
img = os.path.join(base, *images[0]['src'].split('/'))
if os.path.exists(img):
return open(img, 'rb').read()
def render_html_svg_workaround(path_to_html, log, width=590, height=750):
from calibre.ebooks.oeb.base import SVG_NS
raw = open(path_to_html, 'rb').read()
@ -108,6 +122,7 @@ def render_html_svg_workaround(path_to_html, log, width=590, height=750):
data = extract_calibre_cover(raw, os.path.dirname(path_to_html), log)
except:
pass
if data is None:
renderer = render_html(path_to_html, width, height)
data = getattr(renderer, 'data', None)

View File

@ -156,9 +156,10 @@ def add_pipeline_options(parser, plumber):
'SEARCH AND REPLACE' : (
_('Modify the document text and structure using user defined patterns.'),
[
'sr1_search', 'sr1_replace',
'sr2_search', 'sr2_replace',
'sr3_search', 'sr3_replace',
'search_replace',
]
),
@ -211,6 +212,7 @@ def add_pipeline_options(parser, plumber):
if rec.level < rec.HIGH:
option_recommendation_to_cli_option(add_option, rec)
def option_parser():
parser = OptionParser(usage=USAGE)
parser.add_option('--list-recipes', default=False, action='store_true',
@ -271,6 +273,34 @@ def abspath(x):
return x
return os.path.abspath(os.path.expanduser(x))
def read_sr_patterns(path, log=None):
import json, re, codecs
pats = []
with codecs.open(path, 'r', 'utf-8') as f:
pat = None
for line in f.readlines():
if line.endswith(u'\n'):
line = line[:-1]
if pat is None:
if not line.strip():
continue
try:
re.compile(line)
except:
msg = u'Invalid regular expression: %r from file: %r'%(
line, path)
if log is not None:
log.error(msg)
raise SystemExit(1)
else:
raise ValueError(msg)
pat = line
else:
pats.append((pat, line))
pat = None
return json.dumps(pats)
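# A minimal sketch of the patterns file this function expects (hypothetical
# content, not part of this commit): alternating lines of regular expression
# and replacement text, UTF-8 encoded; a pattern with an empty replacement is
# followed by a blank line. For example:
#
#     (?i)chapter\s+(\d+)
#     Chapter \1
#     <font[^>]*>
#     <blank line: empty replacement>
#
# Assuming calibre's usual underscore-to-dash mapping for CLI option names,
# this would be used as:
#     ebook-convert book.epub book.mobi --search-replace patterns.txt
# The pairs are re-serialized via json.dumps() above, which is the form the
# pipeline's search_replace option consumes.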
def main(args=sys.argv):
log = Log()
parser, plumber = create_option_parser(args, log)
@ -278,6 +308,9 @@ def main(args=sys.argv):
for x in ('read_metadata_from_opf', 'cover'):
if getattr(opts, x, None) is not None:
setattr(opts, x, abspath(getattr(opts, x)))
if opts.search_replace:
opts.search_replace = read_sr_patterns(opts.search_replace, log)
recommendations = [(n.dest, getattr(opts, n.dest),
OptionRecommendation.HIGH) \
for n in parser.options_iter()

View File

@ -7,41 +7,17 @@ import os
from calibre.customize.conversion import InputFormatPlugin
def run_mobi_unpack(stream, options, log, accelerators):
from mobiunpack.mobi_unpack import Mobi8Reader
from calibre.customize.ui import plugin_for_input_format
from calibre.ptempfile import PersistentTemporaryDirectory
wdir = PersistentTemporaryDirectory('_unpack_space')
m8r = Mobi8Reader(stream, wdir)
if m8r.isK8():
epub_path = m8r.processMobi8()
epub_input = plugin_for_input_format('epub')
for opt in epub_input.options:
setattr(options, opt.option.name, opt.recommended_value)
options.input_encoding = m8r.getCodec()
return epub_input.convert(open(epub_path,'rb'), options,
'epub', log, accelerators)
class MOBIInput(InputFormatPlugin):
name = 'MOBI Input'
author = 'Kovid Goyal'
description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
file_types = set(['mobi', 'prc', 'azw'])
file_types = set(['mobi', 'prc', 'azw', 'azw3'])
def convert(self, stream, options, file_ext, log,
accelerators):
self.is_kf8 = False
if os.environ.get('USE_MOBIUNPACK', None) is not None:
pos = stream.tell()
try:
return run_mobi_unpack(stream, options, log, accelerators)
except Exception:
log.exception('mobi_unpack code not working')
stream.seek(pos)
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
from lxml import html
parse_cache = {}

View File

@ -6,8 +6,6 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from cStringIO import StringIO
from calibre.customize.conversion import OutputFormatPlugin
from calibre.customize.conversion import OptionRecommendation
@ -79,18 +77,9 @@ class MOBIOutput(OutputFormatPlugin):
def check_for_masthead(self):
found = 'masthead' in self.oeb.guide
if not found:
from calibre.ebooks import generate_masthead
self.oeb.log.debug('No masthead found in manifest, generating default mastheadImage...')
try:
from PIL import Image as PILImage
PILImage
except ImportError:
import Image as PILImage
raw = open(P('content_server/calibre_banner.png'), 'rb')
im = PILImage.open(raw)
of = StringIO()
im.save(of, 'GIF')
raw = of.getvalue()
raw = generate_masthead(unicode(self.oeb.metadata['title'][0]))
id, href = self.oeb.manifest.generate('masthead', 'masthead')
self.oeb.manifest.add(id, href, 'image/gif', data=raw)
self.oeb.guide.add('masthead', 'Masthead Image', href)
@ -151,13 +140,70 @@ class MOBIOutput(OutputFormatPlugin):
# Fix up the periodical href to point to first section href
toc.nodes[0].href = toc.nodes[0].nodes[0].href
def remove_html_cover(self):
from calibre.ebooks.oeb.base import OEB_DOCS
oeb = self.oeb
if not oeb.metadata.cover \
or 'cover' not in oeb.guide:
return
href = oeb.guide['cover'].href
del oeb.guide['cover']
item = oeb.manifest.hrefs[href]
if item.spine_position is not None:
self.log.warn('Found an HTML cover: ', item.href, 'removing it.',
'If you find some content missing from the output MOBI, it '
'is because you misidentified the HTML cover in the input '
'document')
oeb.spine.remove(item)
if item.media_type in OEB_DOCS:
self.oeb.manifest.remove(item)
def convert(self, oeb, output_path, input_plugin, opts, log):
from calibre.utils.config import tweaks
from calibre.ebooks.mobi.writer2.resources import Resources
self.log, self.opts, self.oeb = log, opts, oeb
mobi_type = tweaks.get('test_mobi_output_type', 'old')
if self.is_periodical:
mobi_type = 'old' # Amazon does not support KF8 periodicals
create_kf8 = mobi_type in ('new', 'both')
self.remove_html_cover()
resources = Resources(oeb, opts, self.is_periodical,
add_fonts=create_kf8)
self.check_for_periodical()
if create_kf8:
# Split on pagebreaks so that the resulting KF8 works better with
# calibre's viewer, which does not support CSS page breaks
from calibre.ebooks.oeb.transforms.split import Split
Split()(self.oeb, self.opts)
kf8 = self.create_kf8(resources, for_joint=mobi_type=='both'
) if create_kf8 else None
if mobi_type == 'new':
kf8.write(output_path)
self.extract_mobi(output_path, opts)
return
self.log('Creating MOBI 6 output')
self.write_mobi(input_plugin, output_path, kf8, resources)
def create_kf8(self, resources, for_joint=False):
from calibre.ebooks.mobi.writer8.main import create_kf8_book
return create_kf8_book(self.oeb, self.opts, resources,
for_joint=for_joint)
def write_mobi(self, input_plugin, output_path, kf8, resources):
from calibre.ebooks.mobi.mobiml import MobiMLizer
from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
from calibre.customize.ui import plugin_for_input_format
opts, oeb = self.opts, self.oeb
if not opts.no_inline_toc:
tocadder = HTMLTOCAdder(title=opts.toc_title, position='start' if
opts.mobi_toc_at_start else 'end')
@ -169,15 +215,19 @@ class MOBIOutput(OutputFormatPlugin):
rasterizer(oeb, opts)
except Unavailable:
self.log.warn('SVG rasterizer unavailable, SVG will not be converted')
else:
# Add rasterized SVG images
resources.add_extra_images()
mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
mobimlizer(oeb, opts)
self.check_for_periodical()
write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz')
from calibre.ebooks.mobi.writer2.main import MobiWriter
writer = MobiWriter(opts,
writer = MobiWriter(opts, resources, kf8,
write_page_breaks_after_item=write_page_breaks_after_item)
writer(oeb, output_path)
self.extract_mobi(output_path, opts)
def extract_mobi(self, output_path, opts):
if opts.extract_to is not None:
from calibre.ebooks.mobi.debug.main import inspect_mobi
ddir = opts.extract_to

View File

@ -536,7 +536,7 @@ OptionRecommendation(name='pubdate',
OptionRecommendation(name='timestamp',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the book timestamp (used by the date column in calibre).')),
help=_('Set the book timestamp (no longer used anywhere)')),
OptionRecommendation(name='enable_heuristics',
recommended_value=False, level=OptionRecommendation.LOW,
@ -626,6 +626,14 @@ OptionRecommendation(name='sr3_search',
OptionRecommendation(name='sr3_replace',
recommended_value='', level=OptionRecommendation.LOW,
help=_('Replacement to replace the text found with sr3-search.')),
OptionRecommendation(name='search_replace',
recommended_value=None, level=OptionRecommendation.LOW, help=_(
'Path to a file containing search and replace regular expressions. '
'The file must contain alternating lines of regular expression '
'followed by replacement pattern (which can be an empty line). '
'The regular expression must be in Python regex syntax and '
'the file must be UTF-8 encoded.')),
]
# }}}
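For reference, a minimal sketch (an assumption, not the actual read_sr_patterns implementation referenced elsewhere in this commit) of how such an alternating-lines file could be reduced to the JSON list that HTMLPreProcessor later loads with json.loads():

import codecs, json

def read_sr_patterns_sketch(path):
    # Pair up alternating lines: regex, then its replacement (possibly empty)
    pats = []
    with codecs.open(path, 'r', 'utf-8') as f:
        lines = f.read().splitlines()
    for i in range(0, len(lines) - 1, 2):
        pattern, replacement = lines[i], lines[i + 1]
        if pattern:
            pats.append([pattern, replacement])
    return json.dumps(pats)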

View File

@ -5,7 +5,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import functools, re
import functools, re, json
from calibre import entity_to_unicode, as_unicode
@ -515,18 +515,31 @@ class HTMLPreProcessor(object):
if not getattr(self.extra_opts, 'keep_ligatures', False):
html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
for search, replace in [['sr3_search', 'sr3_replace'], ['sr2_search', 'sr2_replace'], ['sr1_search', 'sr1_replace']]:
# Function for processing search and replace
def do_search_replace(search_pattern, replace_txt):
try:
search_re = re.compile(search_pattern)
if not replace_txt:
replace_txt = ''
rules.insert(0, (search_re, replace_txt))
except Exception as e:
self.log.error('Failed to parse %r regexp because %s' %
(search, as_unicode(e)))
# search / replace using the sr?_search / sr?_replace options
for i in range(1, 4):
search, replace = 'sr%d_search'%i, 'sr%d_replace'%i
search_pattern = getattr(self.extra_opts, search, '')
replace_txt = getattr(self.extra_opts, replace, '')
if search_pattern:
try:
search_re = re.compile(search_pattern)
replace_txt = getattr(self.extra_opts, replace, '')
if not replace_txt:
replace_txt = ''
rules.insert(0, (search_re, replace_txt))
except Exception as e:
self.log.error('Failed to parse %r regexp because %s' %
(search, as_unicode(e)))
do_search_replace(search_pattern, replace_txt)
# multi-search / replace using the search_replace option
search_replace = getattr(self.extra_opts, 'search_replace', None)
if search_replace:
search_replace = json.loads(search_replace)
for search_pattern, replace_txt in search_replace:
do_search_replace(search_pattern, replace_txt)
end_rules = []
# delete soft hyphens - moved here so it's executed after header/footer removal
@ -546,7 +559,7 @@ class HTMLPreProcessor(object):
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
end_rules.append(
# Un wrap using punctuation
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:

View File

@ -148,6 +148,7 @@ class HeuristicProcessor(object):
return wordcount.words
def markup_italicis(self, html):
self.log.debug("\n\n\nitalicize debugging \n\n\n")
ITALICIZE_WORDS = [
'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
@ -156,28 +157,30 @@ class HeuristicProcessor(object):
]
ITALICIZE_STYLE_PATS = [
ur'(?msu)(?<=[\s>"\'])_(?P<words>[^_]+)_',
ur'(?msu)(?<=[\s>"\'])/(?P<words>[^/\*><]+)/',
ur'(?msu)(?<=[\s>"\'])_\*/(?P<words>[^\*_]+)/\*_',
ur'(?msu)(?<=[\s>"\'])~~(?P<words>[^~]+)~~',
ur'(?msu)(?<=[\s>"\'])\*(?P<words>[^\*]+)\*',
ur'(?msu)(?<=[\s>"\'])~(?P<words>[^~]+)~',
ur'(?msu)(?<=[\s>"\'])_/(?P<words>[^/_]+)/_',
ur'(?msu)(?<=[\s>"\'])_\*(?P<words>[^\*_]+)\*_',
ur'(?msu)(?<=[\s>"\'])\*/(?P<words>[^/\*]+)/\*',
ur'(?msu)(?<=[\s>"\'])_\*/(?P<words>[^\*_]+)/\*_',
ur'(?msu)(?<=[\s>"\'])/:(?P<words>[^:/]+):/',
ur'(?msu)(?<=[\s>"\'])\|:(?P<words>[^:\|]+):\|',
ur'(?msu)(?<=[\s>"\'])\*(?P<words>[^\*]+)\*',
ur'(?msu)(?<=[\s>"\'])~(?P<words>[^~]+)~',
ur'(?msu)(?<=[\s>"\'])/(?P<words>[^/\*><]+)/',
ur'(?msu)(?<=[\s>"\'])_(?P<words>[^_]+)_'
]
for word in ITALICIZE_WORDS:
html = re.sub(r'(?<=\s|>)' + re.escape(word) + r'(?=\s|<)', '<i>%s</i>' % word, html)
def sub(mo):
return '<i>%s</i>'%mo.group('words')
search_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
search_text = re.sub(r'<[^>]*>', '', search_text)
for pat in ITALICIZE_STYLE_PATS:
html = re.sub(pat, sub, html)
for match in re.finditer(pat, search_text):
ital_string = str(match.group('words'))
#self.log.debug("italicising "+str(match.group(0))+" with <i>"+ital_string+"</i>")
html = re.sub(re.escape(str(match.group(0))), '<i>%s</i>' % ital_string, html)
return html
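As an illustration (not part of the commit), the first style pattern above turns underscore-delimited text into italics; this uses the direct substitution form for brevity, whereas the new code above searches a tag-stripped copy of the text first:

import re

ITAL = r'(?msu)(?<=[\s>"\'])_(?P<words>[^_]+)_'
sample = 'He was <b>very</b> _quite sure_ of it'
print(re.sub(ITAL, lambda m: '<i>%s</i>' % m.group('words'), sample))
# -> He was <b>very</b> <i>quite sure</i> of it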
def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
@ -316,13 +319,20 @@ class HeuristicProcessor(object):
'''
Unwraps lines based on line length and punctuation
supports a range of html markup and text files
The lookahead regex below is meant to look for any non-full-stop characters. Punctuation
characters which can be used as a full stop should *not* be added below (e.g. ?!.);
the reason for this is to prevent false positive unwrapping. False positives are more
difficult to detect than false negatives during a manual review of the document.
This function intentionally leaves hyphenated content alone, as that is handled by the
dehyphenation routine in a separate step.
'''
# define the pieces of the regex
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
# define the pieces of the regex
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
soft_hyphen = u"\xad"
dash = u"\x2d" # some ocrs doesn't convert dashes to hyphens
line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
@ -331,23 +341,19 @@ class HeuristicProcessor(object):
unwrap_regex = lookahead+line_ending+blanklines+line_opening
em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
dash_unwrap_regex = dash+line_ending+blanklines+line_opening
if format == 'txt':
unwrap_regex = lookahead+txt_line_wrap
em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
shy_unwrap_regex = soft_hyphen+txt_line_wrap
dash_unwrap_regex = dash+txt_line_wrap
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
dash_unwrap = re.compile(u"%s" % dash_unwrap_regex, re.UNICODE)
content = unwrap.sub(' ', content)
content = em_en_unwrap.sub('', content)
content = shy_unwrap.sub('', content)
content = dash_unwrap.sub('', content)
return content
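A short, hedged sketch of the unwrap idea above (the real lookahead and txt_line_wrap are much longer; both are trimmed here so the example stays readable):

import re

length = 10
lookahead = u'(?<=.{%i}[a-z,:)])' % length       # trimmed character class
txt_line_wrap = u'((\u0020|\u0009)*\n){1,2}'     # assumption: approximate form
unwrap = re.compile(lookahead + txt_line_wrap, re.UNICODE)
wrapped = u'This line was wrapped by the\nconversion mid sentence.'
print(unwrap.sub(u' ', wrapped))
# -> This line was wrapped by the conversion mid sentence.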
def txt_process(self, match):
@ -460,27 +466,31 @@ class HeuristicProcessor(object):
return html
def detect_whitespace(self, html):
blanks_around_headings = re.compile(r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<heading><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
blanks_around_headings = re.compile(r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
blanks_around_scene_breaks = re.compile(r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><p class="scenebreak"[^>]*>.*?</p>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
blanks_n_nopunct = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE|re.DOTALL)
def merge_header_whitespace(match):
initblanks = match.group('initparas')
endblanks = match.group('initparas')
heading = match.group('heading')
endblanks = match.group('endparas')
content = match.group('content')
top_margin = ''
bottom_margin = ''
if initblanks is not None:
top_margin = 'margin-top:'+str(len(self.single_blank.findall(initblanks)))+'em;'
if endblanks is not None:
bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(initblanks)))+'em;'
bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(endblanks)))+'em;'
if initblanks == None and endblanks == None:
return heading
return content
elif content.find('scenebreak') != -1:
return content
else:
heading = re.sub('(?i)<h(?P<hnum>\d+)[^>]*>', '\n\n<h'+'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', heading)
return heading
content = re.sub('(?i)<h(?P<hnum>\d+)[^>]*>', '\n\n<h'+'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', content)
return content
html = blanks_around_headings.sub(merge_header_whitespace, html)
html = blanks_around_scene_breaks.sub(merge_header_whitespace, html)
def markup_whitespaces(match):
blanks = match.group(0)
@ -515,6 +525,12 @@ class HeuristicProcessor(object):
html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
return html
def detect_scene_breaks(self, html):
scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
scene_breaks = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
html = scene_breaks.sub(self.scene_break_open+'\g<break>'+'</p>', html)
return html
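For illustration (with simplified line_open/line_close, not the exact ones defined on the class), the detector above marks separator lines like '* * *' so later passes can find them by class:

import re

line_open, line_close = r'<p[^>]*>\s*', r'\s*</p>'
pat = line_open + r'(?P<break>((?P<c>(?!\s)\W)\s*(?P=c)?)+)' + line_close
print(re.sub(pat, r'<p class="scenebreak">\g<break></p>', '<p>* * *</p>'))
# -> <p class="scenebreak">* * *</p>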
def markup_user_break(self, replacement_break):
'''
Takes string a user supplies and wraps it in markup that will be centered with
@ -781,25 +797,25 @@ class HeuristicProcessor(object):
if getattr(self.extra_opts, 'format_scene_breaks', False):
self.log.debug('Formatting scene breaks')
html = re.sub('(?i)<div[^>]*>\s*<br(\s?/)?>\s*</div>', '<p></p>', html)
html = self.detect_scene_breaks(html)
html = self.detect_whitespace(html)
html = self.detect_soft_breaks(html)
blanks_count = len(self.any_multi_blank.findall(html))
if blanks_count >= 1:
html = self.merge_blanks(html, blanks_count)
scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
detected_scene_break = re.compile(r'<p class="scenebreak"[^>]*>.*?</p>')
scene_break_count = len(detected_scene_break.findall(html))
# If the user has enabled scene break replacement, then either softbreaks
# or 'hard' scene breaks are replaced, depending on which is in use
# Otherwise separator lines are centered, with a slightly larger margin in this case
replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
if replacement_break:
replacement_break = self.markup_user_break(replacement_break)
if len(scene_break.findall(html)) >= 1:
html = scene_break.sub(replacement_break, html)
if scene_break_count >= 1:
html = detected_scene_break.sub(replacement_break, html)
html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
else:
html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
else:
html = scene_break.sub(self.scene_break_open+'\g<break>'+'</p>', html)
if self.deleted_nbsps:
# put back non-breaking spaces in empty paragraphs so they render correctly

View File

@ -18,6 +18,7 @@ from lxml import etree
from calibre import prepare_string_for_xml
from calibre.constants import __appname__, __version__
from calibre.utils.magick import Image
from calibre.utils.localization import lang_as_iso639_1
class FB2MLizer(object):
'''
@ -103,7 +104,10 @@ class FB2MLizer(object):
metadata['version'] = __version__
metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year)
if self.oeb_book.metadata.language:
metadata['lang'] = self.oeb_book.metadata.language[0].value
lc = lang_as_iso639_1(self.oeb_book.metadata.language[0].value)
if not lc:
lc = self.oeb_book.metadata.language[0].value
metadata['lang'] = lc or 'en'
else:
metadata['lang'] = u'en'
metadata['id'] = None

View File

@ -197,14 +197,18 @@ class OverDrive(Source):
title_tokens = list(self.get_title_tokens(title,
strip_joiners=False, strip_subtitle=True))
if len(title_tokens) >= len(author_tokens):
xref_q = ''
if len(author_tokens) <= 1:
initial_q = ' '.join(title_tokens)
xref_q = '+'.join(author_tokens)
else:
initial_q = ' '.join(author_tokens)
xref_q = '+'.join(title_tokens)
#log.error('Initial query is %s'%initial_q)
#log.error('Cross reference query is %s'%xref_q)
for token in title_tokens:
if len(xref_q) < len(token):
xref_q = token
log.error('Initial query is %s'%initial_q)
log.error('Cross reference query is %s'%xref_q)
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
query = '{"szKeyword":"'+initial_q+'"}'
@ -219,27 +223,30 @@ class OverDrive(Source):
# get the search results object
results = False
iterations = 0
while results == False:
iterations += 1
xreq = mechanize.Request(q_xref)
xreq.add_header('X-Requested-With', 'XMLHttpRequest')
xreq.add_header('Referer', q_init_search)
xreq.add_header('Accept', 'application/json, text/javascript, */*')
raw = br.open_novisit(xreq).read()
for m in re.finditer(ur'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)', raw):
if int(m.group('displayrecords')) >= 1:
results = True
elif int(m.group('totalrecords')) >= 1:
if int(m.group('totalrecords')) >= 100:
if xref_q.find('+') != -1:
xref_tokens = xref_q.split('+')
xref_q = xref_tokens[0]
#log.error('xref_q is '+xref_q)
else:
xref_q = ''
xref_q = ''
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
elif int(m.group('totalrecords')) == 0:
if int(m.group('totalrecords')) == 0:
return ''
elif int(m.group('displayrecords')) >= 1:
results = True
elif int(m.group('totalrecords')) >= 1 and iterations < 3:
if xref_q.find('+') != -1:
xref_tokens = xref_q.split('+')
xref_q = xref_tokens[0]
for token in xref_tokens:
if len(xref_q) < len(token):
xref_q = token
#log.error('rewrote xref_q, new query is '+xref_q)
else:
xref_q = ''
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
return self.sort_ovrdrv_results(raw, log, title, title_tokens, author, author_tokens)
@ -263,6 +270,7 @@ class OverDrive(Source):
else:
if creators:
creators = creators.split(', ')
# if an exact match in a preferred format occurs
if ((author and creators and creators[0] == author[0]) or (not author and not creators)) and od_title.lower() == title.lower() and int(formatid) in [1, 50, 410, 900] and thumbimage:
return self.format_results(reserveid, od_title, subtitle, series, publisher,
@ -330,9 +338,9 @@ class OverDrive(Source):
def find_ovrdrv_data(self, br, log, title, author, isbn, ovrdrv_id=None):
q = base_url
if ovrdrv_id is None:
return self.overdrive_search(br, log, q, title, author)
return self.overdrive_search(br, log, q, title, author)
else:
return self.overdrive_get_record(br, log, q, ovrdrv_id)
return self.overdrive_get_record(br, log, q, ovrdrv_id)
@ -461,10 +469,10 @@ if __name__ == '__main__':
[
(
{'title':'Foundation and Earth',
'authors':['Asimov']},
[title_test('Foundation and Earth', exact=True),
authors_test(['Isaac Asimov'])]
{'title':'The Sea Kings Daughter',
'authors':['Elizabeth Peters']},
[title_test('The Sea Kings Daughter', exact=False),
authors_test(['Elizabeth Peters'])]
),
(

View File

@ -48,7 +48,8 @@ def merge_result(oldmi, newmi, ensure_fields=None):
return newmi
def main(do_identify, covers, metadata, ensure_fields):
def main(do_identify, covers, metadata, ensure_fields, tdir):
os.chdir(tdir)
failed_ids = set()
failed_covers = set()
all_failed = True
@ -103,7 +104,8 @@ def single_identify(title, authors, identifiers):
return [metadata_to_opf(r) for r in results], [r.has_cached_cover_url for
r in results], dump_caches(), log.dump()
def single_covers(title, authors, identifiers, caches):
def single_covers(title, authors, identifiers, caches, tdir):
os.chdir(tdir)
load_caches(caches)
log = GUILog()
results = Queue()

View File

@ -295,21 +295,21 @@ class MOBIHeader(object): # {{{
self.datp_record_count, = struct.unpack(b'>I', self.raw[124:128])
self.exth_flags, = struct.unpack(b'>I', self.raw[128:132])
self.has_exth = bool(self.exth_flags & 0x40)
self.has_drm_data = self.length >= 174 and len(self.raw) >= 180
self.has_drm_data = self.length >= 174 and len(self.raw) >= 184
if self.has_drm_data:
self.unknown3 = self.raw[132:164]
self.drm_offset, = struct.unpack(b'>I', self.raw[164:168])
self.drm_count, = struct.unpack(b'>I', self.raw[168:172])
self.drm_size, = struct.unpack(b'>I', self.raw[172:176])
self.drm_flags = bin(struct.unpack(b'>I', self.raw[176:180])[0])
self.unknown3 = self.raw[132:168]
self.drm_offset, self.drm_count, self.drm_size, self.drm_flags = \
struct.unpack(b'>4I', self.raw[168:184])
self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16
self.has_fcis_flis = False
self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False
self.extra_data_flags = 0
if self.has_extra_data_flags:
self.unknown4 = self.raw[180:192]
self.fdst_idx, self.fdst_count = struct.unpack_from(b'>II',
self.unknown4 = self.raw[184:192]
self.fdst_idx, self.fdst_count = struct.unpack_from(b'>LL',
self.raw, 192)
if self.fdst_count <= 1:
self.fdst_idx = NULL_INDEX
(self.fcis_number, self.fcis_count, self.flis_number,
self.flis_count) = struct.unpack(b'>IIII',
self.raw[200:216])
@ -327,7 +327,7 @@ class MOBIHeader(object): # {{{
self.primary_index_record, = struct.unpack(b'>I',
self.raw[244:248])
if self.file_version >= 8:
if self.length >= 248:
(self.sect_idx, self.skel_idx, self.datp_idx, self.oth_idx
) = struct.unpack_from(b'>4L', self.raw, 248)
self.unknown9 = self.raw[264:self.length]
@ -337,12 +337,13 @@ class MOBIHeader(object): # {{{
# The following are all relative to the position of the header record
# make them absolute for ease of debugging
for x in ('sect_idx', 'skel_idx', 'datp_idx', 'oth_idx',
self.relative_records = {'sect_idx', 'skel_idx', 'datp_idx', 'oth_idx',
'meta_orth_indx', 'huffman_record_offset',
'first_non_book_record', 'datp_record_offset', 'fcis_number',
'flis_number', 'primary_index_record', 'fdst_idx',
'first_image_index'):
if hasattr(self, x):
'first_image_index'}
for x in self.relative_records:
if hasattr(self, x) and getattr(self, x) != NULL_INDEX:
setattr(self, x, self.header_offset+getattr(self, x))
if self.has_exth:
@ -355,70 +356,79 @@ class MOBIHeader(object): # {{{
def __str__(self):
ans = ['*'*20 + ' MOBI %d Header '%self.file_version+ '*'*20]
a = ans.append
i = lambda d, x : a('%s (null value: %d): %d'%(d, NULL_INDEX, x))
ans.append('Compression: %s'%self.compression)
ans.append('Unused: %r'%self.unused)
ans.append('Number of text records: %d'%self.number_of_text_records)
ans.append('Text record size: %d'%self.text_record_size)
ans.append('Encryption: %s'%self.encryption_type)
ans.append('Unknown: %r'%self.unknown)
ans.append('Identifier: %r'%self.identifier)
ans.append('Header length: %d'% self.length)
ans.append('Type: %s'%self.type)
ans.append('Encoding: %s'%self.encoding)
ans.append('UID: %r'%self.uid)
ans.append('File version: %d'%self.file_version)
i('Meta Orth Index (Sections index in KF8)', self.meta_orth_indx)
i('Meta Infl Index', self.meta_infl_indx)
ans.append('Secondary index record: %d (null val: %d)'%(
self.secondary_index_record, NULL_INDEX))
ans.append('Reserved: %r'%self.reserved)
ans.append('First non-book record (null value: %d): %d'%(NULL_INDEX,
self.first_non_book_record))
ans.append('Full name offset: %d'%self.fullname_offset)
ans.append('Full name length: %d bytes'%self.fullname_length)
ans.append('Langcode: %r'%self.locale_raw)
ans.append('Language: %s'%self.language)
ans.append('Sub language: %s'%self.sublanguage)
ans.append('Input language: %r'%self.input_language)
ans.append('Output language: %r'%self.output_langauage)
ans.append('Min version: %d'%self.min_version)
ans.append('First Image index: %d'%self.first_image_index)
ans.append('Huffman record offset: %d'%self.huffman_record_offset)
ans.append('Huffman record count: %d'%self.huffman_record_count)
ans.append('DATP record offset: %r'%self.datp_record_offset)
ans.append('DATP record count: %r'%self.datp_record_count)
ans.append('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth))
def i(d, x):
x = 'NULL' if x == NULL_INDEX else x
a('%s: %s'%(d, x))
def r(d, attr):
x = getattr(self, attr)
if attr in self.relative_records and x != NULL_INDEX:
a('%s: Absolute: %d Relative: %d'%(d, x, x-self.header_offset))
else:
i(d, x)
a('Compression: %s'%self.compression)
a('Unused: %r'%self.unused)
a('Number of text records: %d'%self.number_of_text_records)
a('Text record size: %d'%self.text_record_size)
a('Encryption: %s'%self.encryption_type)
a('Unknown: %r'%self.unknown)
a('Identifier: %r'%self.identifier)
a('Header length: %d'% self.length)
a('Type: %s'%self.type)
a('Encoding: %s'%self.encoding)
a('UID: %r'%self.uid)
a('File version: %d'%self.file_version)
r('Meta Orth Index', 'meta_orth_indx')
r('Meta Infl Index', 'meta_infl_indx')
r('Secondary index record', 'secondary_index_record')
a('Reserved: %r'%self.reserved)
r('First non-book record', 'first_non_book_record')
a('Full name offset: %d'%self.fullname_offset)
a('Full name length: %d bytes'%self.fullname_length)
a('Langcode: %r'%self.locale_raw)
a('Language: %s'%self.language)
a('Sub language: %s'%self.sublanguage)
a('Input language: %r'%self.input_language)
a('Output language: %r'%self.output_langauage)
a('Min version: %d'%self.min_version)
r('First Image index', 'first_image_index')
r('Huffman record offset', 'huffman_record_offset')
a('Huffman record count: %d'%self.huffman_record_count)
r('DATP record offset', 'datp_record_offset')
a('DATP record count: %r'%self.datp_record_count)
a('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth))
if self.has_drm_data:
ans.append('Unknown3: %r'%self.unknown3)
ans.append('DRM Offset: %s'%self.drm_offset)
ans.append('DRM Count: %s'%self.drm_count)
ans.append('DRM Size: %s'%self.drm_size)
ans.append('DRM Flags: %r'%self.drm_flags)
a('Unknown3: %r'%self.unknown3)
r('DRM Offset', 'drm_offset')
a('DRM Count: %s'%self.drm_count)
a('DRM Size: %s'%self.drm_size)
a('DRM Flags: %r'%self.drm_flags)
if self.has_extra_data_flags:
ans.append('Unknown4: %r'%self.unknown4)
ans.append('FDST Index: %d'% self.fdst_idx)
ans.append('FDST Count: %d'% self.fdst_count)
ans.append('FCIS number: %d'% self.fcis_number)
ans.append('FCIS count: %d'% self.fcis_count)
ans.append('FLIS number: %d'% self.flis_number)
ans.append('FLIS count: %d'% self.flis_count)
ans.append('Unknown6: %r'% self.unknown6)
ans.append('SRCS record index: %d'%self.srcs_record_index)
ans.append('Number of SRCS records?: %d'%self.num_srcs_records)
ans.append('Unknown7: %r'%self.unknown7)
ans.append(('Extra data flags: %s (has multibyte: %s) '
a('Unknown4: %r'%self.unknown4)
r('FDST Index', 'fdst_idx')
a('FDST Count: %d'% self.fdst_count)
r('FCIS number', 'fcis_number')
a('FCIS count: %d'% self.fcis_count)
r('FLIS number', 'flis_number')
a('FLIS count: %d'% self.flis_count)
a('Unknown6: %r'% self.unknown6)
r('SRCS record index', 'srcs_record_index')
a('Number of SRCS records?: %d'%self.num_srcs_records)
a('Unknown7: %r'%self.unknown7)
a(('Extra data flags: %s (has multibyte: %s) '
'(has indexing: %s) (has uncrossable breaks: %s)')%(
bin(self.extra_data_flags), self.has_multibytes,
self.has_indexing_bytes, self.has_uncrossable_breaks ))
ans.append('Primary index record (null value: %d): %d'%(NULL_INDEX,
self.primary_index_record))
if self.file_version >= 8:
i('Sections Index', self.sect_idx)
i('SKEL Index', self.skel_idx)
i('DATP Index', self.datp_idx)
i('Other Index', self.oth_idx)
r('NCX index', 'primary_index_record')
if self.length >= 248:
r('Sections Index', 'sect_idx')
r('SKEL Index', 'skel_idx')
r('DATP Index', 'datp_idx')
r('Other Index', 'oth_idx')
if self.unknown9:
a('Unknown9: %r'%self.unknown9)

View File

@ -0,0 +1,185 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from collections import OrderedDict, namedtuple
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import (CNCX, parse_indx_header,
parse_tagx_section, parse_index_record, INDEX_HEADER_FIELDS)
from calibre.ebooks.mobi.reader.ncx import (tag_fieldname_map, default_entry)
File = namedtuple('File',
'file_number name divtbl_count start_position length')
Elem = namedtuple('Chunk',
'insert_pos toc_text file_number sequence_number start_pos '
'length')
GuideRef = namedtuple('GuideRef', 'type title pos_fid')
def read_index(sections, idx, codec):
table, cncx = OrderedDict(), CNCX([], codec)
data = sections[idx].raw
indx_header = parse_indx_header(data)
indx_count = indx_header['count']
if indx_header['ncncx'] > 0:
off = idx + indx_count + 1
cncx_records = [x.raw for x in sections[off:off+indx_header['ncncx']]]
cncx = CNCX(cncx_records, codec)
tag_section_start = indx_header['tagx']
control_byte_count, tags = parse_tagx_section(data[tag_section_start:])
for i in xrange(idx + 1, idx + 1 + indx_count):
# Index record
data = sections[i].raw
parse_index_record(table, data, control_byte_count, tags, codec,
indx_header['ordt_map'], strict=True)
return table, cncx, indx_header
class Index(object):
def __init__(self, idx, records, codec):
self.table = self.cncx = self.header = self.records = None
if idx != NULL_INDEX:
self.table, self.cncx, self.header = read_index(records, idx, codec)
def render(self):
ans = ['*'*10 + ' Index Header ' + '*'*10]
a = ans.append
if self.header is not None:
for field in INDEX_HEADER_FIELDS:
a('%-12s: %r'%(field, self.header[field]))
ans.extend(['', ''])
if self.cncx:
a('*'*10 + ' CNCX ' + '*'*10)
for offset, val in self.cncx.iteritems():
a('%10s: %s'%(offset, val))
ans.extend(['', ''])
if self.table is not None:
a('*'*10 + ' %d Index Entries '%len(self.table) + '*'*10)
for k, v in self.table.iteritems():
a('%s: %r'%(k, v))
if self.records:
ans.extend(['', '', '*'*10 + ' Parsed Entries ' + '*'*10])
for f in self.records:
a(repr(f))
return ans + ['']
def __str__(self):
return '\n'.join(self.render())
def __iter__(self):
return iter(self.records)
class SKELIndex(Index):
def __init__(self, skelidx, records, codec):
super(SKELIndex, self).__init__(skelidx, records, codec)
self.records = []
if self.table is not None:
for i, text in enumerate(self.table.iterkeys()):
tag_map = self.table[text]
if set(tag_map.iterkeys()) != {1, 6}:
raise ValueError('SKEL Index has unknown tags: %s'%
(set(tag_map.iterkeys())-{1,6}))
self.records.append(File(
i, # file_number
text, # name
tag_map[1][0], # divtbl_count
tag_map[6][0], # start_pos
tag_map[6][1]) # length
)
class SECTIndex(Index):
def __init__(self, sectidx, records, codec):
super(SECTIndex, self).__init__(sectidx, records, codec)
self.records = []
if self.table is not None:
for i, text in enumerate(self.table.iterkeys()):
tag_map = self.table[text]
if set(tag_map.iterkeys()) != {2, 3, 4, 6}:
raise ValueError('Chunk Index has unknown tags: %s'%
(set(tag_map.iterkeys())-{2, 3, 4, 6}))
toc_text = self.cncx[tag_map[2][0]]
self.records.append(Elem(
int(text), # insert_pos
toc_text, # toc_text
tag_map[3][0], # file_number
tag_map[4][0], # sequence_number
tag_map[6][0], # start_pos
tag_map[6][1] # length
)
)
class GuideIndex(Index):
def __init__(self, guideidx, records, codec):
super(GuideIndex, self).__init__(guideidx, records, codec)
self.records = []
if self.table is not None:
for i, text in enumerate(self.table.iterkeys()):
tag_map = self.table[text]
if set(tag_map.iterkeys()) not in ({1, 6}, {1, 2, 3}):
raise ValueError('Guide Index has unknown tags: %s'%
tag_map)
title = self.cncx[tag_map[1][0]]
self.records.append(GuideRef(
text,
title,
tag_map[6] if 6 in tag_map else (tag_map[2], tag_map[3])
)
)
class NCXIndex(Index):
def __init__(self, ncxidx, records, codec):
super(NCXIndex, self).__init__(ncxidx, records, codec)
self.records = []
if self.table is not None:
for num, x in enumerate(self.table.iteritems()):
text, tag_map = x
entry = default_entry.copy()
entry['name'] = text
entry['num'] = num
for tag in tag_fieldname_map.iterkeys():
fieldname, i = tag_fieldname_map[tag]
if tag in tag_map:
fieldvalue = tag_map[tag][i]
if tag == 6:
# Appears to be an idx into the KF8 elems table with an
# offset
fieldvalue = tuple(tag_map[tag])
entry[fieldname] = fieldvalue
for which, name in {3:'text', 5:'kind', 70:'description',
71:'author', 72:'image_caption',
73:'image_attribution'}.iteritems():
if tag == which:
entry[name] = self.cncx.get(fieldvalue,
default_entry[name])
self.records.append(entry)

View File

@ -10,8 +10,11 @@ __docformat__ = 'restructuredtext en'
import sys, os, imghdr, struct
from itertools import izip
from calibre import CurrentDir
from calibre.ebooks.mobi.debug.headers import TextRecord
from calibre.ebooks.mobi.utils import read_font_record
from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex, NCXIndex,
GuideIndex)
from calibre.ebooks.mobi.utils import read_font_record, decode_tbs
from calibre.ebooks.mobi.debug import format_bytes
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
@ -42,6 +45,24 @@ class FDST(object):
return '\n'.join(ans)
class File(object):
def __init__(self, skel, skeleton, text, first_aid, sections):
self.name = 'part%04d'%skel.file_number
self.skeleton, self.text, self.first_aid = skeleton, text, first_aid
self.sections = sections
def dump(self, ddir):
with open(os.path.join(ddir, self.name + '.html'), 'wb') as f:
f.write(self.text)
base = os.path.join(ddir, self.name + '-parts')
os.mkdir(base)
with CurrentDir(base):
with open('skeleton.html', 'wb') as f:
f.write(self.skeleton)
for i, text in enumerate(self.sections):
with open('sect-%04d.html'%i, 'wb') as f:
f.write(text)
class MOBIFile(object):
@ -65,6 +86,9 @@ class MOBIFile(object):
self.header = self.mf.mobi8_header
self.extract_resources()
self.read_fdst()
self.read_indices()
self.build_files()
self.read_tbs()
def print_header(self, f=sys.stdout):
print (str(self.mf.palmdb).encode('utf-8'), file=f)
@ -85,6 +109,45 @@ class MOBIFile(object):
if self.fdst.num_sections != self.header.fdst_count:
raise ValueError('KF8 Header contains invalid FDST count')
def read_indices(self):
self.skel_index = SKELIndex(self.header.skel_idx, self.mf.records,
self.header.encoding)
self.sect_index = SECTIndex(self.header.sect_idx, self.mf.records,
self.header.encoding)
self.ncx_index = NCXIndex(self.header.primary_index_record,
self.mf.records, self.header.encoding)
self.guide_index = GuideIndex(self.header.oth_idx, self.mf.records,
self.header.encoding)
def build_files(self):
text = self.raw_text
self.files = []
for skel in self.skel_index.records:
sects = [x for x in self.sect_index.records if x.file_number
== skel.file_number]
skeleton = text[skel.start_position:skel.start_position+skel.length]
ftext = skeleton
first_aid = sects[0].toc_text
sections = []
for sect in sects:
start_pos = skel.start_position + skel.length + sect.start_pos
sect_text = text[start_pos:start_pos+sect.length]
insert_pos = sect.insert_pos - skel.start_position
ftext = ftext[:insert_pos] + sect_text + ftext[insert_pos:]
sections.append(sect_text)
self.files.append(File(skel, skeleton, ftext, first_aid, sections))
def dump_flows(self, ddir):
if self.fdst is None:
raise ValueError('This MOBI file has no FDST record')
for i, x in enumerate(self.fdst.sections):
start, end = x
raw = self.raw_text[start:end]
with open(os.path.join(ddir, 'flow%04d.txt'%i), 'wb') as f:
f.write(raw)
def extract_resources(self):
self.resource_map = []
known_types = {b'FLIS', b'FCIS', b'SRCS',
@ -121,6 +184,54 @@ class MOBIFile(object):
self.resource_map.append(('%s/%06d%s.%s'%(prefix, i, suffix, ext),
payload))
def read_tbs(self):
from calibre.ebooks.mobi.writer8.tbs import (Entry,
collect_indexing_data)
entry_map = []
for index in self.ncx_index:
enders = [e['pos'] for e in self.ncx_index if e['pos'] >
index['pos'] and
e['hlvl'] <= index['hlvl']]
end = min(enders+[len(self.raw_text)])
entry_map.append(Entry(index=index['num'], title=index['text'],
depth=index['hlvl'],
parent=index['parent'] if index['parent'] > -1 else None,
first_child=index['child1'] if index['child1'] > -1 else None,
last_child=index['childn'] if index['childn'] > -1 else None,
start=index['pos'], length=end-index['pos']))
indexing_data = collect_indexing_data(entry_map,
len(self.text_records))
self.indexing_data = []
for i, data in enumerate(indexing_data):
rec = self.text_records[i]
tbs_bytes = rec.trailing_data.get('indexing', b'')
desc = ['Record #%d'%i]
for x in ('starts', 'completes', 'ends', 'spans'):
points = ['\t%d at depth: %d'%(e.index, e.depth) for e in
getattr(data, x)]
if points:
desc.append(x+':')
desc.extend(points)
desc.append('TBS Bytes: ' + format_bytes(tbs_bytes))
flag_sz = 3
sequences = []
while tbs_bytes:
try:
val, extra, consumed = decode_tbs(tbs_bytes, flag_size=flag_sz)
except:
break
flag_sz = 4
tbs_bytes = tbs_bytes[consumed:]
extra = {bin(k):v for k, v in extra.iteritems()}
sequences.append((val, extra))
for i, seq in enumerate(sequences):
desc.append('Sequence #%d: %r %r'%(i, seq[0], seq[1]))
if tbs_bytes:
desc.append('Remaining bytes: %s'%format_bytes(tbs_bytes))
desc.append('')
self.indexing_data.append('\n'.join(desc))
def inspect_mobi(mobi_file, ddir):
f = MOBIFile(mobi_file)
@ -131,7 +242,8 @@ def inspect_mobi(mobi_file, ddir):
with open(alltext, 'wb') as of:
of.write(f.raw_text)
for x in ('text_records', 'images', 'fonts', 'binary'):
for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows',
'tbs'):
os.mkdir(os.path.join(ddir, x))
for rec in f.text_records:
@ -145,3 +257,24 @@ def inspect_mobi(mobi_file, ddir):
with open(os.path.join(ddir, 'fdst.record'), 'wb') as fo:
fo.write(str(f.fdst).encode('utf-8'))
with open(os.path.join(ddir, 'skel.record'), 'wb') as fo:
fo.write(str(f.skel_index).encode('utf-8'))
with open(os.path.join(ddir, 'chunks.record'), 'wb') as fo:
fo.write(str(f.sect_index).encode('utf-8'))
with open(os.path.join(ddir, 'ncx.record'), 'wb') as fo:
fo.write(str(f.ncx_index).encode('utf-8'))
with open(os.path.join(ddir, 'guide.record'), 'wb') as fo:
fo.write(str(f.guide_index).encode('utf-8'))
with open(os.path.join(ddir, 'tbs', 'all.txt'), 'wb') as fo:
fo.write(('\n'.join(f.indexing_data)).encode('utf-8'))
for part in f.files:
part.dump(os.path.join(ddir, 'files'))
f.dump_flows(os.path.join(ddir, 'flows'))

View File

@ -10,7 +10,7 @@ import copy
import re
from lxml import etree
from calibre.ebooks.oeb.base import namespace, barename
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, OEB_DOCS, urlnormalize
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, urlnormalize
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.transforms.flatcss import KeyMapper
from calibre.utils.magick.draw import identify_data
@ -109,26 +109,8 @@ class MobiMLizer(object):
self.profile = profile = context.dest
self.fnums = fnums = dict((v, k) for k, v in profile.fnums.items())
self.fmap = KeyMapper(profile.fbase, profile.fbase, fnums.keys())
self.remove_html_cover()
self.mobimlize_spine()
def remove_html_cover(self):
oeb = self.oeb
if not oeb.metadata.cover \
or 'cover' not in oeb.guide:
return
href = oeb.guide['cover'].href
del oeb.guide['cover']
item = oeb.manifest.hrefs[href]
if item.spine_position is not None:
self.log.warn('Found an HTML cover,', item.href, 'removing it.',
'If you find some content missing from the output MOBI, it '
'is because you misidentified the HTML cover in the input '
'document')
oeb.spine.remove(item)
if item.media_type in OEB_DOCS:
self.oeb.manifest.remove(item)
def mobimlize_spine(self):
'Iterate over the spine and convert it to MOBIML'
for item in self.oeb.spine:
@ -473,7 +455,7 @@ class MobiMLizer(object):
if tag in TABLE_TAGS and self.ignore_tables:
tag = 'span' if tag == 'td' else 'div'
if tag == 'table':
if tag in ('table', 'td', 'tr'):
col = style.backgroundColor
if col:
elem.set('bgcolor', col)

View File

@ -111,6 +111,13 @@ class CNCX(object): # {{{
def get(self, offset, default=None):
return self.records.get(offset, default)
def __bool__(self):
return bool(self.records)
__nonzero__ = __bool__
def iteritems(self):
return self.records.iteritems()
# }}}
def parse_tagx_section(data):

View File

@ -223,15 +223,15 @@ def insert_images_into_markup(parts, resource_map, log):
# Handle any embedded raster images links in the xhtml text
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
img_pattern = re.compile(r'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''')
img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)[^')"]*[)'"]''')
style_pattern = re.compile(r'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''',
re.IGNORECASE)
for i in xrange(len(parts)):
part = parts[i]
#[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
# links to raster image files
# image_pattern
srcpieces = img_pattern.split(part)
for j in range(1, len(srcpieces), 2):
for j in xrange(1, len(srcpieces), 2):
tag = srcpieces[j]
if tag.startswith('<im'):
for m in img_index_pattern.finditer(tag):
@ -248,6 +248,30 @@ def insert_images_into_markup(parts, resource_map, log):
# store away modified version
parts[i] = part
# Replace urls used in style attributes
for i in xrange(len(parts)):
part = parts[i]
srcpieces = style_pattern.split(part)
for j in xrange(1, len(srcpieces), 2):
tag = srcpieces[j]
if 'kindle:embed' in tag:
for m in img_index_pattern.finditer(tag):
num = int(m.group(1), 32)
href = resource_map[num-1]
osep = m.group()[0]
csep = m.group()[-1]
if href:
replacement = '%s%s%s'%(osep, '../' + href, csep)
tag = img_index_pattern.sub(replacement, tag, 1)
else:
log.warn('Referenced image %s was not recognized as '
'a valid image in %s' % (num, tag))
srcpieces[j] = tag
part = "".join(srcpieces)
# store away modified version
parts[i] = part
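The XXXX in kindle:embed:XXXX is a base-32 resource index, which is why the loops above decode it with int(m.group(1), 32) and index resource_map at num-1. For example:

print(int('0001', 32))  # -> 1, i.e. resource_map[0]
print(int('000A', 32))  # -> 10, i.e. resource_map[9]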
def upshift_markup(parts):
tag_pattern = re.compile(r'''(<(?:svg)[^>]*>)''', re.IGNORECASE)

View File

@ -109,7 +109,7 @@ class Mobi8Reader(object):
table, cncx = read_index(self.kf8_sections, self.header.othidx,
self.header.codec)
Item = namedtuple('Item',
'type title div_frag_num')
'type title pos_fid')
for i, ref_type in enumerate(table.iterkeys()):
tag_map = table[ref_type]
@ -119,7 +119,7 @@ class Mobi8Reader(object):
if 3 in tag_map.keys():
fileno = tag_map[3][0]
if 6 in tag_map.keys():
fileno = tag_map[6][0]
fileno = tag_map[6]
self.guide.append(Item(ref_type.decode(self.header.codec),
title, fileno))
@ -287,23 +287,24 @@ class Mobi8Reader(object):
def create_guide(self):
guide = Guide()
for ref_type, ref_title, fileno in self.guide:
has_start = False
for ref_type, ref_title, pos_fid in self.guide:
try:
elem = self.elems[fileno]
except IndexError:
# Happens for thumbnailstandard in Amazon book samples
continue
fi = self.get_file_info(elem.insert_pos)
idtext = self.get_id_tag(elem.insert_pos).decode(self.header.codec)
linktgt = fi.filename
if len(pos_fid) != 2:
continue
except TypeError:
continue # thumbnailstandard record, ignore it
linktgt, idtext = self.get_id_tag_by_pos_fid(*pos_fid)
if idtext:
linktgt += b'#' + idtext
g = Guide.Reference('%s/%s'%(fi.type, linktgt), os.getcwdu())
g = Guide.Reference(linktgt, os.getcwdu())
g.title, g.type = ref_title, ref_type
if g.title == 'start' or g.type == 'text':
has_start = True
guide.append(g)
so = self.header.exth.start_offset
if so not in {None, NULL_INDEX}:
if so not in {None, NULL_INDEX} and not has_start:
fi = self.get_file_info(so)
if fi.filename is not None:
idtext = self.get_id_tag(so).decode(self.header.codec)

View File

@ -7,13 +7,15 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct, string, imghdr, zlib
import struct, string, imghdr, zlib, os
from collections import OrderedDict
from io import BytesIO
from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
from calibre.ebooks import normalize
IMAGE_MAX_SIZE = 10 * 1024 * 1024
RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))
def decode_string(raw, codec='utf-8', ordt_map=''):
length, = struct.unpack(b'>B', raw[0])
@ -364,15 +366,17 @@ def count_set_bits(num):
num >>= 1
return ans
def to_base(num, base=32):
def to_base(num, base=32, min_num_digits=None):
digits = string.digits + string.ascii_uppercase
sign = 1 if num >= 0 else -1
if num == 0: return '0'
if num == 0: return ('0' if min_num_digits is None else '0'*min_num_digits)
num *= sign
ans = []
while num:
ans.append(digits[(num % base)])
num //= base
if min_num_digits is not None and len(ans) < min_num_digits:
ans.extend('0'*(min_num_digits - len(ans)))
if sign < 0:
ans.append('-')
ans.reverse()
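Assuming the elided tail of to_base() joins the digit list, the new min_num_digits parameter zero-pads short values, e.g.:

# to_base(255, base=32, min_num_digits=4) -> '007V'
# to_base(0, min_num_digits=4)            -> '0000'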
@ -388,27 +392,8 @@ def mobify_image(data):
data = im.export('gif')
return data
def read_zlib_header(header):
header = bytearray(header)
# See sec 2.2 of RFC 1950 for the zlib stream format
# http://www.ietf.org/rfc/rfc1950.txt
if (header[0]*256 + header[1])%31 != 0:
return None, 'Bad zlib header, FCHECK failed'
cmf = header[0] & 0b1111
cinfo = header[0] >> 4
if cmf != 8:
return None, 'Unknown zlib compression method: %d'%cmf
if cinfo > 7:
return None, 'Invalid CINFO field in zlib header: %d'%cinfo
fdict = (header[1]&0b10000)>>5
if fdict != 0:
return None, 'FDICT based zlib compression not supported'
wbits = cinfo + 8
return wbits, None
def read_font_record(data, extent=1040): # {{{
# Font records {{{
def read_font_record(data, extent=1040):
'''
Return the font encoded in the MOBI FONT record represented by data.
The return value in a dict with fields raw_data, font_data, err, ext,
@ -466,15 +451,8 @@ def read_font_record(data, extent=1040): # {{{
if flags & 0b1:
# ZLIB compressed data
wbits, err = read_zlib_header(font_data[:2])
if err is not None:
ans['err'] = err
return ans
adler32, = struct.unpack_from(b'>I', font_data, len(font_data) - 4)
try:
# remove two bytes of zlib header and 4 bytes of trailing checksum
# negative wbits indicates no standard gzip header
font_data = zlib.decompress(font_data[2:-4], -wbits, usize)
font_data = zlib.decompress(font_data)
except Exception as e:
ans['err'] = 'Failed to zlib decompress font data (%s)'%e
return ans
@ -483,23 +461,146 @@ def read_font_record(data, extent=1040): # {{{
ans['err'] = 'Uncompressed font size mismatch'
return ans
if False:
# For some reason these almost never match, probably Amazon has a
# buggy Adler32 implementation
sig = (zlib.adler32(font_data) & 0xffffffff)
if sig != adler32:
ans['err'] = ('Adler checksum did not match. Stored: %d '
'Calculated: %d')%(adler32, sig)
return ans
ans['font_data'] = font_data
sig = font_data[:4]
ans['ext'] = ('ttf' if sig in {b'\0\1\0\0', b'true', b'ttcf'}
else 'otf' if sig == b'OTTO' else 'dat')
return ans
def write_font_record(data, obfuscate=True, compress=True):
'''
Write the ttf/otf font represented by data into a font record. See
read_font_record() for details on the format of the record.
'''
flags = 0
key_len = 20
usize = len(data)
xor_key = b''
if compress:
flags |= 0b1
data = zlib.compress(data, 9)
if obfuscate:
flags |= 0b10
xor_key = os.urandom(key_len)
key = bytearray(xor_key)
data = bytearray(data)
for i in xrange(1040):
data[i] ^= key[i%key_len]
data = bytes(data)
key_start = struct.calcsize(b'>5L') + 4
data_start = key_start + len(xor_key)
header = b'FONT' + struct.pack(b'>5L', usize, flags, data_start,
len(xor_key), key_start)
return header + xor_key + data
# }}}
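A hedged round-trip sketch of the new write_font_record() against the existing reader (assumes this module imports cleanly; the random payload stands in for a real TTF, since the XOR obfuscation touches the first 1040 bytes of the compressed data, so the payload must not compress below that):

import os
from calibre.ebooks.mobi.utils import read_font_record, write_font_record

font_bytes = b'\x00\x01\x00\x00' + os.urandom(4096)  # fake TTF: sfnt version + noise
rec = write_font_record(font_bytes)                   # compressed + obfuscated FONT record
ans = read_font_record(rec)
assert ans['err'] is None
assert ans['font_data'] == font_bytes and ans['ext'] == 'ttf'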
def create_text_record(text):
'''
Return a Palmdoc record of size RECORD_SIZE from the text file object.
In case the record ends in the middle of a multibyte character return
the overlap as well.
Returns data, overlap: where both are byte strings. overlap is the
extra bytes needed to complete the truncated multibyte character.
'''
opos = text.tell()
text.seek(0, 2)
# npos is the position of the next record
npos = min((opos + RECORD_SIZE, text.tell()))
# Number of bytes from the next record needed to complete the last
# character in this record
extra = 0
last = b''
while not last.decode('utf-8', 'ignore'):
# last contains no valid utf-8 characters
size = len(last) + 1
text.seek(npos - size)
last = text.read(size)
# last now has one valid utf-8 char and possibly some bytes that belong
# to a truncated char
try:
last.decode('utf-8', 'strict')
except UnicodeDecodeError:
# There are some truncated bytes in last
prev = len(last)
while True:
text.seek(npos - prev)
last = text.read(len(last) + 1)
try:
last.decode('utf-8')
except UnicodeDecodeError:
pass
else:
break
extra = len(last) - prev
text.seek(opos)
data = text.read(RECORD_SIZE)
overlap = text.read(extra)
text.seek(npos)
return data, overlap
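A quick usage check of the boundary handling described in the docstring: a 3-byte character straddling the 4096-byte record boundary yields 2 overlap bytes.

from io import BytesIO
from calibre.ebooks.mobi.utils import create_text_record

text = BytesIO((u'x' * 4095 + u'\u20ac' + u'y' * 10).encode('utf-8'))
data, overlap = create_text_record(text)
print(len(data), len(overlap))  # -> 4096 2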
class CNCX(object): # {{{
'''
Create the CNCX records. These are records containing all the strings from
an index. Each record is of the form: <vwi string size><utf-8 encoded
string>
'''
MAX_STRING_LENGTH = 500
def __init__(self, strings=()):
self.strings = OrderedDict((s, 0) for s in strings)
self.records = []
offset = 0
buf = BytesIO()
for key in tuple(self.strings.iterkeys()):
utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
l = len(utf8)
sz_bytes = encint(l)
raw = sz_bytes + utf8
if 0xfbf8 - buf.tell() < 6 + len(raw):
# Records in PDB files cannot be larger than 0x10000, so we
# stop well before that.
pad = 0xfbf8 - buf.tell()
buf.write(b'\0' * pad)
self.records.append(buf.getvalue())
buf.seek(0), buf.truncate(0)
offset = len(self.records) * 0x10000
buf.write(raw)
self.strings[key] = offset
offset += len(raw)
val = buf.getvalue()
if val:
self.records.append(align_block(val))
def __getitem__(self, string):
return self.strings[string]
def __bool__(self):
return bool(self.records)
__nonzero__ = __bool__
def __len__(self):
return len(self.records)
# }}}
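Usage sketch: the stored offsets are byte positions of each encoded string within the record stream, so lookups are cheap at index-build time:

from calibre.ebooks.mobi.utils import CNCX

cncx = CNCX([u'Chapter One', u'Chapter Two'])
print(cncx[u'Chapter One'], cncx[u'Chapter Two'])  # -> 0 12
print(len(cncx))  # -> 1 record, since both strings fit well under 0xfbf8 bytes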
def is_guide_ref_start(ref):
return (ref.title.lower() == 'start' or
(ref.type and ref.type.lower() in {'start',
'other.start', 'text'}))

View File

@ -12,5 +12,4 @@ UNCOMPRESSED = 1
PALMDOC = 2
HUFFDIC = 17480
PALM_MAX_IMAGE_SIZE = 63 * 1024
RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))

View File

@ -12,56 +12,22 @@ from struct import pack
from cStringIO import StringIO
from collections import OrderedDict, defaultdict
from calibre.ebooks.mobi.writer2 import RECORD_SIZE
from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
encode_tbs, align_block, utf8_text)
encode_tbs, align_block, RECORD_SIZE, CNCX as CNCX_)
class CNCX(object): # {{{
'''
Create the CNCX records. These are records containing all the strings from
the NCX. Each record is of the form: <vwi string size><utf-8 encoded
string>
'''
MAX_STRING_LENGTH = 500
class CNCX(CNCX_): # {{{
def __init__(self, toc, is_periodical):
self.strings = OrderedDict()
strings = []
for item in toc.iterdescendants(breadth_first=True):
self.strings[item.title] = 0
strings.append(item.title)
if is_periodical:
self.strings[item.klass] = 0
strings.append(item.klass)
if item.author:
self.strings[item.author] = 0
strings.append(item.author)
if item.description:
self.strings[item.description] = 0
self.records = []
offset = 0
buf = StringIO()
for key in tuple(self.strings.iterkeys()):
utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
l = len(utf8)
sz_bytes = encint(l)
raw = sz_bytes + utf8
if 0xfbf8 - buf.tell() < 6 + len(raw):
# Records in PDB files cannot be larger than 0x10000, so we
# stop well before that.
pad = 0xfbf8 - buf.tell()
buf.write(b'\0' * pad)
self.records.append(buf.getvalue())
buf.truncate(0)
offset = len(self.records) * 0x10000
buf.write(raw)
self.strings[key] = offset
offset += len(raw)
self.records.append(align_block(buf.getvalue()))
def __getitem__(self, string):
return self.strings[string]
strings.append(item.description)
CNCX_.__init__(self, strings)
# }}}
class TAGX(object): # {{{
@ -534,14 +500,14 @@ class Indexer(object): # {{{
# Write offsets to index entries as an IDXT block
idxt_block = b'IDXT'
buf.truncate(0)
buf.seek(0), buf.truncate(0)
for offset in offsets:
buf.write(pack(b'>H', header_length+offset))
idxt_block = align_block(idxt_block + buf.getvalue())
body = index_block + idxt_block
header = b'INDX'
buf.truncate(0)
buf.seek(0), buf.truncate(0)
buf.write(pack(b'>I', header_length))
buf.write(b'\0'*4) # Unknown
buf.write(pack(b'>I', 1)) # Header type? Or index record number?

View File

@ -7,51 +7,31 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, random, time
import random, time
from cStringIO import StringIO
from struct import pack
from calibre.ebooks import normalize, generate_masthead
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
from calibre.ebooks import normalize
from calibre.ebooks.mobi.writer2.serializer import Serializer
from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.utils.filenames import ascii_filename
from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED, RECORD_SIZE)
from calibre.ebooks.mobi.utils import (rescale_image, encint, mobify_image,
encode_trailing_data, align_block, detect_periodical)
from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED)
from calibre.ebooks.mobi.utils import (encint, encode_trailing_data,
align_block, detect_periodical, RECORD_SIZE, create_text_record)
from calibre.ebooks.mobi.writer2.indexer import Indexer
from calibre.ebooks.mobi import MAX_THUMB_DIMEN, MAX_THUMB_SIZE
EXTH_CODES = {
'creator': 100,
'publisher': 101,
'description': 103,
'identifier': 104,
'subject': 105,
'pubdate': 106,
'review': 107,
'contributor': 108,
'rights': 109,
'type': 111,
'source': 112,
'versionnumber': 114,
'startreading': 116,
'coveroffset': 201,
'thumboffset': 202,
'hasfakecover': 203,
'lastupdatetime': 502,
'title': 503,
}
# Disabled as I don't care about uncrossable breaks
WRITE_UNCROSSABLE_BREAKS = False
NULL_INDEX = 0xffffffff
class MobiWriter(object):
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
def __init__(self, opts, write_page_breaks_after_item=True):
def __init__(self, opts, resources, kf8, write_page_breaks_after_item=True):
self.opts = opts
self.resources = resources
self.kf8 = kf8
self.for_joint = kf8 is not None
self.write_page_breaks_after_item = write_page_breaks_after_item
self.compression = UNCOMPRESSED if opts.dont_compress else PALMDOC
self.prefer_author_sort = opts.prefer_author_sort
@ -83,7 +63,7 @@ class MobiWriter(object):
self.stream = stream
self.records = [None]
self.generate_content()
self.generate_record0()
self.generate_joint_record0() if self.for_joint else self.generate_record0()
self.write_header()
self.write_content()
@ -151,73 +131,19 @@ class MobiWriter(object):
# Images {{{
def generate_images(self):
oeb = self.oeb
oeb.logger.info('Serializing images...')
self.image_records = []
self.image_map = {}
self.masthead_offset = 0
index = 1
resources = self.resources
image_records = resources.records
self.image_map = resources.item_map
self.masthead_offset = resources.masthead_offset
self.cover_offset = resources.cover_offset
self.thumbnail_offset = resources.thumbnail_offset
mh_href = None
if 'masthead' in oeb.guide and oeb.guide['masthead'].href:
mh_href = oeb.guide['masthead'].href
self.image_records.append(None)
index += 1
elif self.is_periodical:
# Generate a default masthead
data = generate_masthead(unicode(self.oeb.metadata['title'][0]))
self.image_records.append(data)
index += 1
cover_href = self.cover_offset = self.thumbnail_offset = None
if (oeb.metadata.cover and
unicode(oeb.metadata.cover[0]) in oeb.manifest.ids):
cover_id = unicode(oeb.metadata.cover[0])
item = oeb.manifest.ids[cover_id]
cover_href = item.href
for item in self.oeb.manifest.values():
if item.media_type not in OEB_RASTER_IMAGES: continue
try:
data = item.data
if self.opts.mobi_keep_original_images:
data = mobify_image(data)
else:
data = rescale_image(data)
except:
oeb.logger.warn('Bad image file %r' % item.href)
continue
else:
if mh_href and item.href == mh_href:
self.image_records[0] = data
continue
self.image_records.append(data)
self.image_map[item.href] = index
index += 1
if cover_href and item.href == cover_href:
self.cover_offset = self.image_map[item.href] - 1
try:
data = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
maxsizeb=MAX_THUMB_SIZE)
except:
oeb.logger.warn('Failed to generate thumbnail')
else:
self.image_records.append(data)
self.thumbnail_offset = index - 1
index += 1
finally:
item.unload_data_from_memory()
if self.image_records and self.image_records[0] is None:
if image_records and image_records[0] is None:
raise ValueError('Failed to find masthead image in manifest')
# }}}
# Text {{{
def generate_text(self):
def generate_text(self): # {{{
self.oeb.logger.info('Serializing markup content...')
self.serializer = Serializer(self.oeb, self.image_map,
self.is_periodical,
@ -232,7 +158,7 @@ class MobiWriter(object):
self.oeb.logger.info(' Compressing markup content...')
while text.tell() < self.text_length:
data, overlap = self.read_text_record(text)
data, overlap = create_text_record(text)
if self.compression == PALMDOC:
data = compress_doc(data)
@ -249,57 +175,6 @@ class MobiWriter(object):
if records_size % 4 != 0:
self.records.append(b'\x00'*(records_size % 4))
self.first_non_text_record_idx += 1
def read_text_record(self, text):
'''
Return a Palmdoc record of size RECORD_SIZE from the text file object.
In case the record ends in the middle of a multibyte character return
the overlap as well.
Returns data, overlap: where both are byte strings. overlap is the
extra bytes needed to complete the truncated multibyte character.
'''
opos = text.tell()
text.seek(0, 2)
# npos is the position of the next record
npos = min((opos + RECORD_SIZE, text.tell()))
# Number of bytes from the next record needed to complete the last
# character in this record
extra = 0
last = b''
while not last.decode('utf-8', 'ignore'):
# last contains no valid utf-8 characters
size = len(last) + 1
text.seek(npos - size)
last = text.read(size)
# last now has one valid utf-8 char and possibly some bytes that belong
# to a truncated char
try:
last.decode('utf-8', 'strict')
except UnicodeDecodeError:
# There are some truncated bytes in last
prev = len(last)
while True:
text.seek(npos - prev)
last = text.read(len(last) + 1)
try:
last.decode('utf-8')
except UnicodeDecodeError:
pass
else:
break
extra = len(last) - prev
text.seek(opos)
data = text.read(RECORD_SIZE)
overlap = text.read(extra)
text.seek(npos)
return data, overlap
# }}}
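# A minimal standalone sketch (not the calibre implementation) of the
# record/overlap contract documented above, which now lives in
# create_text_record in calibre.ebooks.mobi.utils. A tiny record size is
# used instead of the real RECORD_SIZE; the bytes needed to finish a
# truncated multibyte character are returned as overlap, and the next
# record re-reads them.
def split_with_overlap(text, record_size=3):
    buf = bytearray(text)
    pos = 0
    while pos < len(buf):
        npos = min(pos + record_size, len(buf))
        extra = 0
        # UTF-8 continuation bytes have the form 0b10xxxxxx
        while npos + extra < len(buf) and 0x80 <= buf[npos + extra] < 0xC0:
            extra += 1
        yield bytes(buf[pos:npos]), bytes(buf[npos:npos + extra])
        pos = npos

# list(split_with_overlap(u'ab\xe9cd'.encode('utf-8'))) yields
# [(b'ab\xc3', b'\xa9'), (b'\xa9cd', b'')]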
def generate_record0(self): # MOBI header {{{
@ -315,11 +190,20 @@ class MobiWriter(object):
# header as well
bt = 0x103 if self.indexer.is_flat_periodical else 0x101
exth = self.build_exth(bt)
from calibre.ebooks.mobi.writer8.exth import build_exth
exth = build_exth(metadata,
prefer_author_sort=self.opts.prefer_author_sort,
is_periodical=self.is_periodical,
share_not_sync=self.opts.share_not_sync,
cover_offset=self.cover_offset,
thumbnail_offset=self.thumbnail_offset,
start_offset=self.serializer.start_offset, mobi_doctype=bt
)
first_image_record = None
if self.image_records:
if self.resources:
used_images = self.serializer.used_images
first_image_record = len(self.records)
self.records.extend(self.image_records)
self.resources.serialize(self.records, used_images)
last_content_record = len(self.records) - 1
# FCIS/FLIS (Seems to serve no purpose)
@ -481,125 +365,72 @@ class MobiWriter(object):
self.records[0] = align_block(record0)
# }}}
def build_exth(self, mobi_doctype): # EXTH Header {{{
oeb = self.oeb
exth = StringIO()
nrecs = 0
for term in oeb.metadata:
if term not in EXTH_CODES: continue
code = EXTH_CODES[term]
items = oeb.metadata[term]
if term == 'creator':
if self.prefer_author_sort:
creators = [normalize(unicode(c.file_as or c)) for c in
items][:1]
else:
creators = [normalize(unicode(c)) for c in items]
items = ['; '.join(creators)]
for item in items:
data = normalize(unicode(item))
if term != 'description':
data = self.COLLAPSE_RE.sub(' ', data)
if term == 'identifier':
if data.lower().startswith('urn:isbn:'):
data = data[9:]
elif item.scheme.lower() == 'isbn':
pass
else:
continue
data = data.encode('utf-8')
exth.write(pack(b'>II', code, len(data) + 8))
exth.write(data)
nrecs += 1
if term == 'rights' :
try:
rights = normalize(unicode(oeb.metadata.rights[0])).encode('utf-8')
except:
rights = b'Unknown'
exth.write(pack(b'>II', EXTH_CODES['rights'], len(rights) + 8))
exth.write(rights)
nrecs += 1
def generate_joint_record0(self): # {{{
from calibre.ebooks.mobi.writer8.mobi import (MOBIHeader,
HEADER_FIELDS)
from calibre.ebooks.mobi.writer8.exth import build_exth
# Write UUID as ASIN
uuid = None
from calibre.ebooks.oeb.base import OPF
for x in oeb.metadata['identifier']:
if (x.get(OPF('scheme'), None).lower() == 'uuid' or
unicode(x).startswith('urn:uuid:')):
uuid = unicode(x).split(':')[-1]
break
if uuid is None:
from uuid import uuid4
uuid = str(uuid4())
# Insert resource records
first_image_record = None
old = len(self.records)
if self.resources:
used_images = self.serializer.used_images | self.kf8.used_images
first_image_record = len(self.records)
self.resources.serialize(self.records, used_images)
resource_record_count = len(self.records) - old
if isinstance(uuid, unicode):
uuid = uuid.encode('utf-8')
if not self.opts.share_not_sync:
exth.write(pack(b'>II', 113, len(uuid) + 8))
exth.write(uuid)
nrecs += 1
# Insert KF8 records
self.records.append(b'BOUNDARY')
kf8_header_index = len(self.records)
self.kf8.start_offset = (self.serializer.start_offset,
self.kf8.start_offset)
self.records.append(self.kf8.record0)
self.records.extend(self.kf8.records[1:])
# Write cdetype
if not self.is_periodical:
if not self.opts.share_not_sync:
exth.write(pack(b'>II', 501, 12))
exth.write(b'EBOK')
nrecs += 1
else:
ids = {0x101:b'NWPR', 0x103:b'MAGZ'}.get(mobi_doctype, None)
if ids:
exth.write(pack(b'>II', 501, 12))
exth.write(ids)
nrecs += 1
first_image_record = (first_image_record if first_image_record else
len(self.records))
# Add a publication date entry
if oeb.metadata['date']:
datestr = str(oeb.metadata['date'][0])
elif oeb.metadata['timestamp']:
datestr = str(oeb.metadata['timestamp'][0])
header_fields = {k:getattr(self.kf8, k) for k in HEADER_FIELDS}
if datestr is None:
raise ValueError("missing date or timestamp")
# Now change the header fields that need to be different in the MOBI 6
# header
header_fields['first_resource_record'] = first_image_record
header_fields['exth_flags'] = 0b100001010000 # Kindlegen uses this
header_fields['fdst_record'] = NULL_INDEX
header_fields['fdst_count'] = 1 # Why not 0? Kindlegen uses 1
extra_data_flags = 0b1 # Has multibyte overlap bytes
if self.primary_index_record_idx is not None:
extra_data_flags |= 0b10
header_fields['extra_data_flags'] = extra_data_flags
datestr = bytes(datestr)
exth.write(pack(b'>II', EXTH_CODES['pubdate'], len(datestr) + 8))
exth.write(datestr)
nrecs += 1
if self.is_periodical:
exth.write(pack(b'>II', EXTH_CODES['lastupdatetime'], len(datestr) + 8))
exth.write(datestr)
nrecs += 1
for k, v in {'last_text_record':'last_text_record_idx',
'first_non_text_record':'first_non_text_record_idx',
'ncx_index':'primary_index_record_idx',
}.iteritems():
header_fields[k] = getattr(self, v)
if header_fields['ncx_index'] is None:
header_fields['ncx_index'] = NULL_INDEX
if self.is_periodical:
# Pretend to be Amazon's super secret periodical generator
vals = {204:201, 205:2, 206:0, 207:101}
else:
# Pretend to be kindlegen 1.2
vals = {204:201, 205:1, 206:2, 207:33307}
for code, val in vals.iteritems():
exth.write(pack(b'>III', code, 12, val))
nrecs += 1
for x in ('skel', 'chunk', 'guide'):
header_fields[x+'_index'] = NULL_INDEX
if self.cover_offset is not None:
exth.write(pack(b'>III', EXTH_CODES['coveroffset'], 12,
self.cover_offset))
exth.write(pack(b'>III', EXTH_CODES['hasfakecover'], 12, 0))
nrecs += 2
if self.thumbnail_offset is not None:
exth.write(pack(b'>III', EXTH_CODES['thumboffset'], 12,
self.thumbnail_offset))
nrecs += 1
# Create the MOBI 6 EXTH
opts = self.opts
kuc = 0 if resource_record_count > 0 else None
if self.serializer.start_offset is not None:
exth.write(pack(b'>III', EXTH_CODES['startreading'], 12,
self.serializer.start_offset))
nrecs += 1
header_fields['exth'] = build_exth(self.oeb.metadata,
prefer_author_sort=opts.prefer_author_sort,
is_periodical=opts.mobi_periodical,
share_not_sync=opts.share_not_sync,
cover_offset=self.cover_offset,
thumbnail_offset=self.thumbnail_offset,
num_of_resources=resource_record_count,
kf8_unknown_count=kuc, be_kindlegen2=True,
kf8_header_index=kf8_header_index,
start_offset=self.serializer.start_offset,
mobi_doctype=2)
self.records[0] = MOBIHeader(file_version=6)(**header_fields)
exth = exth.getvalue()
trail = len(exth) % 4
pad = b'\0' * (4 - trail) # Always pad w/ at least 1 byte
exth = [b'EXTH', pack(b'>II', len(exth) + 12, nrecs), exth, pad]
return b''.join(exth)
# }}}
def write_header(self): # PalmDB header {{{

View File

@ -0,0 +1,136 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import imghdr
from calibre.ebooks.mobi import MAX_THUMB_DIMEN, MAX_THUMB_SIZE
from calibre.ebooks.mobi.utils import (rescale_image, mobify_image,
write_font_record)
from calibre.ebooks import generate_masthead
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
PLACEHOLDER_GIF = b'GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff!\xf9\x04\x01\x00\x00\x00\x00,\x00\x00\x00\x00\x01\x00\x01\x00@\x02\x01D\x00;'
class Resources(object):
def __init__(self, oeb, opts, is_periodical, add_fonts=False):
self.oeb, self.log, self.opts = oeb, oeb.log, opts
self.is_periodical = is_periodical
self.item_map = {}
self.records = []
self.mime_map = {}
self.masthead_offset = 0
self.used_image_indices = set()
self.image_indices = set()
self.cover_offset = self.thumbnail_offset = None
self.add_resources(add_fonts)
def process_image(self, data):
return (mobify_image(data) if self.opts.mobi_keep_original_images else
rescale_image(data))
def add_resources(self, add_fonts):
oeb = self.oeb
oeb.logger.info('Serializing resources...')
index = 1
mh_href = None
if 'masthead' in oeb.guide and oeb.guide['masthead'].href:
mh_href = oeb.guide['masthead'].href
self.records.append(None)
index += 1
self.used_image_indices.add(0)
self.image_indices.add(0)
elif self.is_periodical:
# Generate a default masthead
data = generate_masthead(unicode(self.oeb.metadata['title'][0]))
self.records.append(data)
self.used_image_indices.add(0)
self.image_indices.add(0)
index += 1
cover_href = self.cover_offset = self.thumbnail_offset = None
if (oeb.metadata.cover and
unicode(oeb.metadata.cover[0]) in oeb.manifest.ids):
cover_id = unicode(oeb.metadata.cover[0])
item = oeb.manifest.ids[cover_id]
cover_href = item.href
for item in self.oeb.manifest.values():
if item.media_type not in OEB_RASTER_IMAGES: continue
try:
data = self.process_image(item.data)
except:
self.log.warn('Bad image file %r' % item.href)
continue
else:
if mh_href and item.href == mh_href:
self.records[0] = data
continue
self.image_indices.add(len(self.records))
self.records.append(data)
self.item_map[item.href] = index
self.mime_map[item.href] = 'image/%s'%imghdr.what(None, data)
index += 1
if cover_href and item.href == cover_href:
self.cover_offset = self.item_map[item.href] - 1
self.used_image_indices.add(self.cover_offset)
try:
data = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
maxsizeb=MAX_THUMB_SIZE)
except:
self.log.warn('Failed to generate thumbnail')
else:
self.image_indices.add(len(self.records))
self.records.append(data)
self.thumbnail_offset = index - 1
self.used_image_indices.add(self.thumbnail_offset)
index += 1
finally:
item.unload_data_from_memory()
if add_fonts:
for item in self.oeb.manifest.values():
if item.href and item.href.rpartition('.')[-1].lower() in {
'ttf', 'otf'} and isinstance(item.data, bytes):
self.records.append(write_font_record(item.data))
self.item_map[item.href] = len(self.records)
def add_extra_images(self):
'''
Add any images that were created after the call to add_resources()
'''
for item in self.oeb.manifest.values():
if (item.media_type not in OEB_RASTER_IMAGES or item.href in
self.item_map): continue
try:
data = self.process_image(item.data)
except:
self.log.warn('Bad image file %r' % item.href)
else:
self.records.append(data)
self.item_map[item.href] = len(self.records)
finally:
item.unload_data_from_memory()
def serialize(self, records, used_images):
used_image_indices = self.used_image_indices | {
v-1 for k, v in self.item_map.iteritems() if k in used_images}
for i in self.image_indices-used_image_indices:
self.records[i] = PLACEHOLDER_GIF
records.extend(self.records)
def __bool__(self):
return bool(self.records)
__nonzero__ = __bool__
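# Hypothetical standalone rendering (serialize_with_placeholders is not a
# calibre name) of the substitution serialize() performs above: image
# records whose index was never referenced by any document are swapped for
# the 1x1 transparent PLACEHOLDER_GIF, keeping record numbering stable
# without shipping unused image data.
def serialize_with_placeholders(image_records, image_indices, used_indices):
    out = list(image_records)
    for i in image_indices - used_indices:
        out[i] = PLACEHOLDER_GIF  # placeholder for an unreferenced image
    return out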

View File

@ -12,6 +12,7 @@ import re
from calibre.ebooks.oeb.base import (OEB_DOCS, XHTML, XHTML_NS, XML_NS,
namespace, prefixname, urlnormalize)
from calibre.ebooks.mobi.mobiml import MBP_NS
from calibre.ebooks.mobi.utils import is_guide_ref_start
from collections import defaultdict
from urlparse import urldefrag
@ -39,6 +40,7 @@ class Serializer(object):
self.oeb = oeb
# Map of image hrefs to image index in the MOBI file
self.images = images
self.used_images = set()
self.logger = oeb.logger
self.is_periodical = is_periodical
self.write_page_breaks_after_item = write_page_breaks_after_item
@ -160,9 +162,7 @@ class Serializer(object):
buf.write(b'title="')
self.serialize_text(ref.title, quot=True)
buf.write(b'" ')
if (ref.title.lower() == 'start' or
(ref.type and ref.type.lower() in {'start',
'other.start', 'text'})):
if is_guide_ref_start(ref):
self._start_href = ref.href
self.serialize_href(ref.href)
# Space required or won't work, I kid you not
@ -329,6 +329,7 @@ class Serializer(object):
href = urlnormalize(item.abshref(val))
if href in self.images:
index = self.images[href]
self.used_images.add(href)
buf.write(b'recindex="%05d"' % index)
continue
buf.write(attr.encode('utf-8'))

View File

@ -0,0 +1,11 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@ -0,0 +1,188 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from struct import pack
from io import BytesIO
from calibre.ebooks.mobi.utils import utf8_text
EXTH_CODES = {
'creator': 100,
'publisher': 101,
'description': 103,
'identifier': 104,
'subject': 105,
'pubdate': 106,
'review': 107,
'contributor': 108,
'rights': 109,
'type': 111,
'source': 112,
'versionnumber': 114,
'startreading': 116,
'kf8_header_index': 121,
'num_of_resources': 125,
'kf8_unknown_count': 131,
'coveroffset': 201,
'thumboffset': 202,
'hasfakecover': 203,
'lastupdatetime': 502,
'title': 503,
}
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
def build_exth(metadata, prefer_author_sort=False, is_periodical=False,
share_not_sync=True, cover_offset=None, thumbnail_offset=None,
start_offset=None, mobi_doctype=2, num_of_resources=None,
kf8_unknown_count=0, be_kindlegen2=False, kf8_header_index=None):
exth = BytesIO()
nrecs = 0
for term in metadata:
if term not in EXTH_CODES: continue
code = EXTH_CODES[term]
items = metadata[term]
if term == 'creator':
if prefer_author_sort:
creators = [unicode(c.file_as or c) for c in
items][:1]
else:
creators = [unicode(c) for c in items]
items = ['; '.join(creators)]
for item in items:
data = unicode(item)
if term != 'description':
data = COLLAPSE_RE.sub(' ', data)
if term == 'identifier':
if data.lower().startswith('urn:isbn:'):
data = data[9:]
elif item.scheme.lower() == 'isbn':
pass
else:
continue
data = utf8_text(data)
exth.write(pack(b'>II', code, len(data) + 8))
exth.write(data)
nrecs += 1
if term == 'rights' :
try:
rights = utf8_text(unicode(metadata.rights[0]))
except:
rights = b'Unknown'
exth.write(pack(b'>II', EXTH_CODES['rights'], len(rights) + 8))
exth.write(rights)
nrecs += 1
# Write UUID as ASIN
uuid = None
from calibre.ebooks.oeb.base import OPF
for x in metadata['identifier']:
if (x.get(OPF('scheme'), None).lower() == 'uuid' or
unicode(x).startswith('urn:uuid:')):
uuid = unicode(x).split(':')[-1]
break
if uuid is None:
from uuid import uuid4
uuid = str(uuid4())
if isinstance(uuid, unicode):
uuid = uuid.encode('utf-8')
if not share_not_sync:
exth.write(pack(b'>II', 113, len(uuid) + 8))
exth.write(uuid)
nrecs += 1
# Write cdetype
if not is_periodical:
if not share_not_sync:
exth.write(pack(b'>II', 501, 12))
exth.write(b'EBOK')
nrecs += 1
else:
ids = {0x101:b'NWPR', 0x103:b'MAGZ'}.get(mobi_doctype, None)
if ids:
exth.write(pack(b'>II', 501, 12))
exth.write(ids)
nrecs += 1
# Add a publication date entry
datestr = None
if metadata['date']:
datestr = str(metadata['date'][0])
elif metadata['timestamp']:
datestr = str(metadata['timestamp'][0])
if datestr is None:
raise ValueError("missing date or timestamp")
datestr = bytes(datestr)
exth.write(pack(b'>II', EXTH_CODES['pubdate'], len(datestr) + 8))
exth.write(datestr)
nrecs += 1
if is_periodical:
exth.write(pack(b'>II', EXTH_CODES['lastupdatetime'], len(datestr) + 8))
exth.write(datestr)
nrecs += 1
if be_kindlegen2:
vals = {204:201, 205:2, 206:2, 207:35621}
elif is_periodical:
# Pretend to be Amazon's super secret periodical generator
vals = {204:201, 205:2, 206:0, 207:101}
else:
# Pretend to be kindlegen 1.2
vals = {204:201, 205:1, 206:2, 207:33307}
for code, val in vals.iteritems():
exth.write(pack(b'>III', code, 12, val))
nrecs += 1
if cover_offset is not None:
exth.write(pack(b'>III', EXTH_CODES['coveroffset'], 12,
cover_offset))
exth.write(pack(b'>III', EXTH_CODES['hasfakecover'], 12, 0))
nrecs += 2
if thumbnail_offset is not None:
exth.write(pack(b'>III', EXTH_CODES['thumboffset'], 12,
thumbnail_offset))
nrecs += 1
if start_offset is not None:
try:
len(start_offset)
except TypeError:
start_offset = [start_offset]
for so in start_offset:
if so is not None:
exth.write(pack(b'>III', EXTH_CODES['startreading'], 12,
so))
nrecs += 1
if kf8_header_index is not None:
exth.write(pack(b'>III', EXTH_CODES['kf8_header_index'], 12,
kf8_header_index))
nrecs += 1
if num_of_resources is not None:
exth.write(pack(b'>III', EXTH_CODES['num_of_resources'], 12,
num_of_resources))
nrecs += 1
if kf8_unknown_count is not None:
exth.write(pack(b'>III', EXTH_CODES['kf8_unknown_count'], 12,
kf8_unknown_count))
nrecs += 1
exth = exth.getvalue()
trail = len(exth) % 4
pad = b'\0' * (4 - trail) # Always pad w/ at least 1 byte
exth = [b'EXTH', pack(b'>II', len(exth) + 12, nrecs), exth, pad]
return b''.join(exth)
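# A minimal sketch of a reader (iter_exth_records is hypothetical, not a
# calibre API) for the EXTH block laid out above: b'EXTH', a 4-byte length
# covering the 12-byte header plus the records (but not the trailing NUL
# padding), a 4-byte record count, then records of (code, length, data)
# where length includes the 8-byte code/length prefix.
from struct import unpack

def iter_exth_records(exth):
    if exth[:4] != b'EXTH':
        raise ValueError('Not an EXTH block')
    length, nrecs = unpack(b'>II', exth[4:12])
    pos = 12
    for _ in xrange(nrecs):
        code, size = unpack(b'>II', exth[pos:pos + 8])
        yield code, exth[pos + 8:pos + size]
        pos += size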

View File

@ -0,0 +1,86 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import random
from io import BytesIO
from collections import OrderedDict
from struct import pack
from calibre.ebooks.mobi.utils import align_block
NULL = 0xffffffff
zeroes = lambda x: b'\0'*x
nulls = lambda x: b'\xff'*x
short = lambda x: pack(b'>H', x)
class Header(OrderedDict):
HEADER_NAME = b''
DEFINITION = '''
'''
ALIGN_BLOCK = False
POSITIONS = {} # Mapping of position field to field whose position should
# be stored in the position field
SHORT_FIELDS = set()
def __init__(self):
OrderedDict.__init__(self)
for line in self.DEFINITION.splitlines():
line = line.strip()
if not line or line.startswith('#'): continue
name, val = [x.strip() for x in line.partition('=')[0::2]]
if val:
val = eval(val, {'zeroes':zeroes, 'NULL':NULL, 'DYN':None,
'nulls':nulls, 'short':short, 'random':random})
else:
val = 0
if name in self:
raise ValueError('Duplicate field in definition: %r'%name)
self[name] = val
@property
def dynamic_fields(self):
return tuple(k for k, v in self.iteritems() if v is None)
def __call__(self, **kwargs):
positions = {}
for name, val in kwargs.iteritems():
if name not in self:
raise KeyError('Not a valid header field: %r'%name)
self[name] = val
buf = BytesIO()
buf.write(bytes(self.HEADER_NAME))
for name, val in self.iteritems():
val = self.format_value(name, val)
positions[name] = buf.tell()
if val is None:
raise ValueError('Dynamic field %r not set'%name)
if isinstance(val, (int, long)):
fmt = 'H' if name in self.SHORT_FIELDS else 'I'
val = pack(b'>'+fmt, val)
buf.write(val)
for pos_field, field in self.POSITIONS.iteritems():
buf.seek(positions[pos_field])
buf.write(pack(b'>I', positions[field]))
ans = buf.getvalue()
if self.ALIGN_BLOCK:
ans = align_block(ans)
return ans
def format_value(self, name, val):
return val
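# Hypothetical illustration (ExampleHeader is not a calibre class) of the
# DEFINITION mini-language parsed above: each non-comment line is
# "name = value", where the value is eval'd with zeroes, nulls, NULL, DYN,
# short and random in scope; a bare name defaults to 0 and any field set to
# DYN must be supplied as a keyword argument when the header is called.
class ExampleHeader(Header):
    HEADER_NAME = b'DEMO'
    DEFINITION = '''
    # A fixed 32-bit field
    magic = 42
    # Filled in at call time
    payload_length = DYN
    # Eight bytes of zero padding
    padding = zeroes(8)
    '''

# ExampleHeader()(payload_length=10) returns
# b'DEMO' + pack(b'>I', 42) + pack(b'>I', 10) + b'\0'*8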

View File

@ -0,0 +1,335 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
from future_builtins import map
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from collections import namedtuple
from struct import pack
from io import BytesIO
from calibre.ebooks.mobi.utils import CNCX, encint, align_block
from calibre.ebooks.mobi.writer8.header import Header
TagMeta_ = namedtuple('TagMeta',
'name number values_per_entry bitmask end_flag')
TagMeta = lambda x:TagMeta_(*x)
EndTagTable = TagMeta(('eof', 0, 0, 0, 1))
# Map of mask to the number of shifts needed; works with one-bit and
# two-bit wide masks, and could be extended to four-bit wide ones as well
mask_to_bit_shifts = { 1:0, 2:1, 3:0, 4:2, 8:3, 12:2, 16:4, 32:5, 48:4, 64:6,
128:7, 192: 6 }
class IndexHeader(Header): # {{{
HEADER_NAME = b'INDX'
ALIGN_BLOCK = True
HEADER_LENGTH = 192
DEFINITION = '''
# 4 - 8: Header Length
header_length = {header_length}
# 8 - 16: Unknown
unknown1 = zeroes(8)
# 16 - 20: Index type: 0 - normal 2 - inflection
type = 2
# 20 - 24: IDXT offset (filled in later)
idxt_offset
# 24 - 28: Number of index records
num_of_records = 1
# 28 - 32: Index encoding (65001 = utf-8)
encoding = 65001
# 32 - 36: Unknown
unknown2 = NULL
# 36 - 40: Number of Index entries
num_of_entries = DYN
# 40 - 44: ORDT offset
ordt_offset
# 44 - 48: LIGT offset
ligt_offset
# 48 - 52: Number of ORDT/LIGT? entries
num_of_ordt_entries
# 52 - 56: Number of CNCX records
num_of_cncx = DYN
# 56 - 180: Unknown
unknown3 = zeroes(124)
# 180 - 184: TAGX offset
tagx_offset = {header_length}
# 184 - 192: Unknown
unknown4 = zeroes(8)
# TAGX
tagx = DYN
# Last Index entry
last_index = DYN
# IDXT
idxt = DYN
'''.format(header_length=HEADER_LENGTH)
POSITIONS = {'idxt_offset':'idxt'}
# }}}
class Index(object): # {{{
control_byte_count = 1
cncx = CNCX()
tag_types = (EndTagTable,)
HEADER_LENGTH = IndexHeader.HEADER_LENGTH
@classmethod
def generate_tagx(cls):
header = b'TAGX'
byts = bytearray()
for tag_meta in cls.tag_types:
byts.extend(tag_meta[1:])
# table length, control byte count
header += pack(b'>II', 12+len(byts), cls.control_byte_count)
return header + bytes(byts)
@classmethod
def calculate_control_bytes_for_each_entry(cls, entries):
control_bytes = []
for lead_text, tags in entries:
cbs = []
ans = 0
for (name, number, vpe, mask, endi) in cls.tag_types:
if endi == 1:
cbs.append(ans)
ans = 0
continue
try:
nvals = len(tags.get(name, ()))
except TypeError:
nvals = 1
nentries = nvals // vpe
shifts = mask_to_bit_shifts[mask]
ans |= mask & (nentries << shifts)
if len(cbs) != cls.control_byte_count:
raise ValueError('The entry %r is invalid'%[lead_text, tags])
control_bytes.append(cbs)
return control_bytes
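# Worked example, using the ChunkIndex tag table defined below: a chunk
# entry carries one value each for cncx_offset (mask 1), file_number
# (mask 2) and sequence_number (mask 4), plus a single 2-value geometry
# tuple (mask 8, values_per_entry 2). Every tag thus contributes
# nentries == 1 and the single control byte is 1 | 2 | 4 | 8 == 0b1111.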
def __call__(self):
self.control_bytes = self.calculate_control_bytes_for_each_entry(
self.entries)
rendered_entries = []
index, idxt, buf = BytesIO(), BytesIO(), BytesIO()
IndexEntry = namedtuple('IndexEntry', 'offset length raw')
last_lead_text = b''
too_large = ValueError('Index has too many entries, calibre does not'
' support generating multiple index records at this'
' time.')
for i, x in enumerate(self.entries):
control_bytes = self.control_bytes[i]
leading_text, tags = x
buf.seek(0), buf.truncate(0)
leading_text = (leading_text.encode('utf-8') if
isinstance(leading_text, unicode) else leading_text)
raw = bytearray(leading_text)
raw.insert(0, len(leading_text))
buf.write(bytes(raw))
buf.write(bytes(bytearray(control_bytes)))
for tag in self.tag_types:
values = tags.get(tag.name, None)
if values is None: continue
try:
len(values)
except TypeError:
values = [values]
if values:
for val in values:
try:
buf.write(encint(val))
except ValueError:
raise ValueError('Invalid values for %r: %r'%(
tag, values))
raw = buf.getvalue()
offset = index.tell()
if offset + self.HEADER_LENGTH >= 0x10000:
raise too_large
rendered_entries.append(IndexEntry(offset, len(raw), raw))
idxt.write(pack(b'>H', self.HEADER_LENGTH+offset))
index.write(raw)
last_lead_text = leading_text
index_block = align_block(index.getvalue())
idxt_block = align_block(b'IDXT' + idxt.getvalue())
body = index_block + idxt_block
if len(body) + self.HEADER_LENGTH >= 0x10000:
raise too_large
header = b'INDX'
buf.seek(0), buf.truncate(0)
buf.write(pack(b'>I', self.HEADER_LENGTH))
buf.write(b'\0'*4) # Unknown
buf.write(pack(b'>I', 1)) # Header type? Or index record number?
buf.write(b'\0'*4) # Unknown
# IDXT block offset
buf.write(pack(b'>I', self.HEADER_LENGTH + len(index_block)))
# Number of index entries
buf.write(pack(b'>I', len(rendered_entries)))
buf.write(b'\xff'*8) # Unknown
buf.write(b'\0'*156) # Unknown
header += buf.getvalue()
index_record = header + body
tagx = self.generate_tagx()
idxt = (b'IDXT' + pack(b'>H', IndexHeader.HEADER_LENGTH + len(tagx)) +
b'\0')
# Last index
idx = bytes(bytearray([len(last_lead_text)])) + last_lead_text
idx += pack(b'>H', len(rendered_entries))
header = {
'num_of_entries': len(rendered_entries),
'num_of_cncx': len(self.cncx),
'tagx':tagx,
'last_index':align_block(idx),
'idxt':idxt
}
header = IndexHeader()(**header)
self.records = [header, index_record]
self.records.extend(self.cncx.records)
return self.records
# }}}
class SkelIndex(Index):
tag_types = tuple(map(TagMeta, (
('chunk_count', 1, 1, 3, 0),
('geometry', 6, 2, 12, 0),
EndTagTable
)))
def __init__(self, skel_table):
self.entries = [
(s.name, {
# Don't ask me why these entries have to be repeated twice
'chunk_count':(s.chunk_count, s.chunk_count),
'geometry':(s.start_pos, s.length, s.start_pos, s.length),
}) for s in skel_table
]
class ChunkIndex(Index):
tag_types = tuple(map(TagMeta, (
('cncx_offset', 2, 1, 1, 0),
('file_number', 3, 1, 2, 0),
('sequence_number', 4, 1, 4, 0),
('geometry', 6, 2, 8, 0),
EndTagTable
)))
def __init__(self, chunk_table):
self.cncx = CNCX(c.selector for c in chunk_table)
self.entries = [
('%010d'%c.insert_pos, {
'cncx_offset':self.cncx[c.selector],
'file_number':c.file_number,
'sequence_number':c.sequence_number,
'geometry':(c.start_pos, c.length),
}) for c in chunk_table
]
class GuideIndex(Index):
tag_types = tuple(map(TagMeta, (
('title', 1, 1, 1, 0),
('pos_fid', 6, 2, 2, 0),
EndTagTable
)))
def __init__(self, guide_table):
self.cncx = CNCX(c.title for c in guide_table)
self.entries = [
(r.type, {
'title':self.cncx[r.title],
'pos_fid':r.pos_fid,
}) for r in guide_table
]
class NCXIndex(Index):
''' The commented out parts have been seen in NCX indexes from MOBI 6
periodicals. Since we have no MOBI 8 periodicals to reverse engineer, leave
it for now. '''
# control_byte_count = 2
tag_types = tuple(map(TagMeta, (
('offset', 1, 1, 1, 0),
('length', 2, 1, 2, 0),
('label', 3, 1, 4, 0),
('depth', 4, 1, 8, 0),
('parent', 21, 1, 16, 0),
('first_child', 22, 1, 32, 0),
('last_child', 23, 1, 64, 0),
('pos_fid', 6, 2, 128, 0),
EndTagTable,
# ('image', 69, 1, 1, 0),
# ('description', 70, 1, 2, 0),
# ('author', 71, 1, 4, 0),
# ('caption', 72, 1, 8, 0),
# ('attribution', 73, 1, 16, 0),
# EndTagTable
)))
def __init__(self, toc_table):
strings = []
for entry in toc_table:
strings.append(entry['label'])
aut = entry.get('author', None)
if aut:
strings.append(aut)
desc = entry.get('description', None)
if desc:
strings.append(desc)
self.cncx = CNCX(strings)
def to_entry(x):
ans = {}
for f in ('offset', 'length', 'depth', 'pos_fid', 'parent',
'first_child', 'last_child'):
if f in x:
ans[f] = x[f]
for f in ('label', 'description', 'author'):
if f in x:
ans[f] = self.cncx[x[f]]
return ('%02x'%x['index'], ans)
self.entries = list(map(to_entry, toc_table))

View File

@ -0,0 +1,406 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import copy, logging
from functools import partial
from collections import defaultdict, namedtuple
from io import BytesIO
from struct import pack
import cssutils
from lxml import etree
from calibre import isbytestring, force_unicode
from calibre.ebooks.mobi.utils import (create_text_record, to_base,
is_guide_ref_start)
from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
extract, XHTML, urlnormalize)
from calibre.ebooks.oeb.parse_utils import barename
from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href
from calibre.ebooks.mobi.writer8.index import (NCXIndex, SkelIndex,
ChunkIndex, GuideIndex)
from calibre.ebooks.mobi.writer8.mobi import KF8Book
from calibre.ebooks.mobi.writer8.tbs import apply_trailing_byte_sequences
from calibre.ebooks.mobi.writer8.toc import TOCAdder
XML_DOCS = OEB_DOCS | {SVG_MIME}
# References to record numbers in KF8 are stored as base-32 encoded integers,
# with 4 digits
to_ref = partial(to_base, base=32, min_num_digits=4)
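# For instance to_ref(11) == '000B', assuming to_base's base-32 digit
# alphabet is 0-9 followed by A-V.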
class KF8Writer(object):
def __init__(self, oeb, opts, resources):
self.oeb, self.opts, self.log = oeb, opts, oeb.log
self.compress = not self.opts.dont_compress
self.has_tbs = False
self.log.info('Creating KF8 output')
# Create an inline ToC if one does not already exist
self.toc_adder = TOCAdder(oeb, opts)
self.used_images = set()
self.resources = resources
self.flows = [None] # First flow item is reserved for the text
self.records = [None] # Placeholder for zeroth record
self.log('\tGenerating KF8 markup...')
self.dup_data()
self.replace_resource_links()
self.extract_css_into_flows()
self.extract_svg_into_flows()
self.replace_internal_links_with_placeholders()
self.insert_aid_attributes()
self.chunk_it_up()
# Dump the cloned data as it is no longer needed
del self._data_cache
self.create_text_records()
self.log('\tCreating indices...')
self.create_fdst_records()
self.create_indices()
self.create_guide()
# We do not want to use this ToC for MOBI 6, so remove it
self.toc_adder.remove_generated_toc()
def dup_data(self):
''' Duplicate data so that any changes we make to markup/CSS only
affect KF8 output and not MOBI 6 output '''
self._data_cache = {}
# Suppress cssutils logging output as it is duplicated anyway earlier
# in the pipeline
cssutils.log.setLevel(logging.CRITICAL)
for item in self.oeb.manifest:
if item.media_type in XML_DOCS:
self._data_cache[item.href] = copy.deepcopy(item.data)
elif item.media_type in OEB_STYLES:
# I can't figure out how to make an efficient copy of the
# in-memory CSSStylesheet, as deepcopy doesn't work (raises an
# exception)
self._data_cache[item.href] = cssutils.parseString(
item.data.cssText, validate=False)
def data(self, item):
return self._data_cache.get(item.href, item.data)
def replace_resource_links(self):
''' Replace links to resources (raster images/fonts) with pointers to
the MOBI record containing the resource. The pointers are of the form:
kindle:embed:XXXX?mime=image/* The ?mime= is apparently optional and
not used for fonts. '''
def pointer(item, oref):
ref = item.abshref(oref)
idx = self.resources.item_map.get(ref, None)
if idx is not None:
is_image = self.resources.records[idx-1][:4] not in {b'FONT'}
idx = to_ref(idx)
if is_image:
self.used_images.add(ref)
return 'kindle:embed:%s?mime=%s'%(idx,
self.resources.mime_map[ref])
else:
return 'kindle:embed:%s'%idx
return oref
for item in self.oeb.manifest:
if item.media_type in XML_DOCS:
root = self.data(item)
for tag in XPath('//h:img|//svg:image')(root):
for attr, ref in tag.attrib.iteritems():
if attr.split('}')[-1].lower() in {'src', 'href'}:
tag.attrib[attr] = pointer(item, ref)
for tag in XPath('//h:style')(root):
if tag.text:
sheet = cssutils.parseString(tag.text, validate=False)
replacer = partial(pointer, item)
cssutils.replaceUrls(sheet, replacer,
ignoreImportRules=True)
repl = sheet.cssText
if isbytestring(repl):
repl = repl.decode('utf-8')
tag.text = '\n'+ repl + '\n'
elif item.media_type in OEB_STYLES:
sheet = self.data(item)
replacer = partial(pointer, item)
cssutils.replaceUrls(sheet, replacer, ignoreImportRules=True)
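# For example, an <img src="images/cover.jpg"> whose data sits in resource
# record 3 becomes <img src="kindle:embed:0003?mime=image/jpeg"> (0003
# being to_ref(3)), while a font resource is referenced as a bare
# kindle:embed:XXXX with no ?mime= query.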
def extract_css_into_flows(self):
inlines = defaultdict(list) # Ensure identical <style>s not repeated
sheets = {}
for item in self.oeb.manifest:
if item.media_type in OEB_STYLES:
data = self.data(item).cssText
sheets[item.href] = len(self.flows)
self.flows.append(force_unicode(data, 'utf-8'))
for item in self.oeb.spine:
root = self.data(item)
for link in XPath('//h:link[@href]')(root):
href = item.abshref(link.get('href'))
idx = sheets.get(href, None)
if idx is not None:
idx = to_ref(idx)
link.set('href', 'kindle:flow:%s?mime=text/css'%idx)
for tag in XPath('//h:style')(root):
p = tag.getparent()
idx = p.index(tag)
raw = tag.text
if not raw or not raw.strip():
extract(tag)
continue
repl = etree.Element(XHTML('link'), type='text/css',
rel='stylesheet')
repl.tail='\n'
p.insert(idx, repl)
extract(tag)
inlines[raw].append(repl)
for raw, elems in inlines.iteritems():
idx = to_ref(len(self.flows))
self.flows.append(raw)
for link in elems:
link.set('href', 'kindle:flow:%s?mime=text/css'%idx)
def extract_svg_into_flows(self):
images = {}
for item in self.oeb.manifest:
if item.media_type == SVG_MIME:
data = self.data(item)
images[item.href] = len(self.flows)
self.flows.append(etree.tostring(data, encoding='UTF-8',
with_tail=True, xml_declaration=True))
for item in self.oeb.spine:
root = self.data(item)
for svg in XPath('//svg:svg')(root):
raw = etree.tostring(svg, encoding=unicode, with_tail=False)
idx = len(self.flows)
self.flows.append(raw)
p = svg.getparent()
pos = p.index(svg)
img = etree.Element(XHTML('img'),
src="kindle:flow:%s?mime=image/svg+xml"%to_ref(idx))
p.insert(pos, img)
extract(svg)
for img in XPath('//h:img[@src]')(root):
src = img.get('src')
abshref = item.abshref(src)
idx = images.get(abshref, None)
if idx is not None:
img.set('src', 'kindle:flow:%s?mime=image/svg+xml'%
to_ref(idx))
def replace_internal_links_with_placeholders(self):
self.link_map = {}
count = 0
hrefs = {item.href for item in self.oeb.spine}
for item in self.oeb.spine:
root = self.data(item)
for a in XPath('//h:a[@href]')(root):
count += 1
ref = item.abshref(a.get('href'))
href, _, frag = ref.partition('#')
href = urlnormalize(href)
if href in hrefs:
placeholder = 'kindle:pos:fid:0000:off:%s'%to_href(count)
self.link_map[placeholder] = (href, frag)
a.set('href', placeholder)
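# For example, if the third <a href> encountered points at 'ch02.html#sec1',
# it is rewritten to href="kindle:pos:fid:0000:off:0000000003" for now;
# set_internal_links() in skeleton.py later replaces the zero fid and the
# counter-based offset with the real chunk position.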
def insert_aid_attributes(self):
self.id_map = {}
for i, item in enumerate(self.oeb.spine):
root = self.data(item)
aidbase = i * int(1e6)
j = 0
for tag in root.iterdescendants(etree.Element):
id_ = tag.attrib.get('id', None)
if id_ is not None or barename(tag.tag).lower() in aid_able_tags:
aid = aidbase + j
tag.attrib['aid'] = to_base(aid, base=32)
if tag.tag == XHTML('body'):
self.id_map[(item.href, '')] = tag.attrib['aid']
if id_ is not None:
self.id_map[(item.href, id_)] = tag.attrib['aid']
j += 1
def chunk_it_up(self):
placeholder_map = {}
for placeholder, x in self.link_map.iteritems():
href, frag = x
aid = self.id_map.get(x, None)
if aid is None:
aid = self.id_map.get((href, ''))
placeholder_map[placeholder] = aid
chunker = Chunker(self.oeb, self.data, placeholder_map)
for x in ('skel_table', 'chunk_table', 'aid_offset_map'):
setattr(self, x, getattr(chunker, x))
self.flows[0] = chunker.text
def create_text_records(self):
self.flows = [x.encode('utf-8') if isinstance(x, unicode) else x for x
in self.flows]
text = b''.join(self.flows)
self.text_length = len(text)
text = BytesIO(text)
nrecords = 0
records_size = 0
if self.compress:
self.oeb.logger.info('\tCompressing markup...')
while text.tell() < self.text_length:
data, overlap = create_text_record(text)
if self.compress:
data = compress_doc(data)
data += overlap
data += pack(b'>B', len(overlap))
self.records.append(data)
records_size += len(data)
nrecords += 1
self.last_text_record_idx = nrecords
self.first_non_text_record_idx = nrecords + 1
# Pad so that the next record starts at a 4 byte boundary
if records_size % 4 != 0:
self.records.append(b'\x00'*(records_size % 4))
self.first_non_text_record_idx += 1
def create_fdst_records(self):
FDST = namedtuple('Flow', 'start end')
entries = []
self.fdst_table = []
for i, flow in enumerate(self.flows):
start = 0 if i == 0 else self.fdst_table[-1].end
self.fdst_table.append(FDST(start, start + len(flow)))
entries.extend(self.fdst_table[-1])
rec = (b'FDST' + pack(b'>LL', 12, len(self.fdst_table)) +
pack(b'>%dL'%len(entries), *entries))
self.fdst_records = [rec]
self.fdst_count = len(self.fdst_table)
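# The FDST record built above is b'FDST', a 4-byte offset to the entry
# table (always 12), a 4-byte flow count, then one (start, end) pair of
# 4-byte offsets per flow. For example, a 7000 byte text flow followed by a
# 300 byte CSS flow serializes as
# b'FDST' + pack(b'>LL', 12, 2) + pack(b'>4L', 0, 7000, 7000, 7300).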
def create_indices(self):
self.skel_records = SkelIndex(self.skel_table)()
self.chunk_records = ChunkIndex(self.chunk_table)()
self.ncx_records = []
toc = self.oeb.toc
entries = []
is_periodical = self.opts.mobi_periodical
if toc.count() < 2:
self.log.warn('Document has no ToC, MOBI will have no NCX index')
return
# Flatten the ToC into a depth first list
fl = toc.iter() if is_periodical else toc.iterdescendants()
for i, item in enumerate(fl):
entry = {'id': id(item), 'index': i, 'href':item.href,
'label':(item.title or _('Unknown')),
'children':[]}
entry['depth'] = getattr(item, 'ncx_hlvl', 0)
p = getattr(item, 'ncx_parent', None)
if p is not None:
entry['parent_id'] = p
for child in item:
child.ncx_parent = entry['id']
child.ncx_hlvl = entry['depth'] + 1
entry['children'].append(id(child))
if is_periodical:
if item.author:
entry['author'] = item.author
if item.description:
entry['description'] = item.description
entries.append(entry)
# The Kindle requires entries to be sorted by (depth, playorder)
entries.sort(key=lambda entry: (entry['depth'], entry['index']))
for i, entry in enumerate(entries):
entry['index'] = i
id_to_index = {entry['id']:entry['index'] for entry in entries}
# Write the hierarchical and start offset information
for entry in entries:
children = entry.pop('children')
if children:
entry['first_child'] = id_to_index[children[0]]
entry['last_child'] = id_to_index[children[-1]]
if 'parent_id' in entry:
entry['parent'] = id_to_index[entry.pop('parent_id')]
href = entry.pop('href')
href, frag = href.partition('#')[0::2]
aid = self.id_map.get((href, frag), None)
if aid is None:
aid = self.id_map.get((href, ''), None)
if aid is None:
pos, fid = 0, 0
else:
pos, fid = self.aid_offset_map[aid]
chunk = self.chunk_table[pos]
offset = chunk.insert_pos + fid
entry['pos_fid'] = (pos, fid)
entry['offset'] = offset
# Write the lengths
def get_next_start(entry):
enders = [e['offset'] for e in entries if e['depth'] <=
entry['depth'] and e['offset'] > entry['offset']]
if enders:
return min(enders)
return len(self.flows[0])
for entry in entries:
entry['length'] = get_next_start(entry) - entry['offset']
self.has_tbs = apply_trailing_byte_sequences(entries, self.records,
self.last_text_record_idx+1)
self.ncx_records = NCXIndex(entries)()
def create_guide(self):
self.start_offset = None
self.guide_table = []
self.guide_records = []
GuideRef = namedtuple('GuideRef', 'title type pos_fid')
for ref in self.oeb.guide.values():
href, frag = ref.href.partition('#')[0::2]
aid = self.id_map.get((href, frag), None)
if aid is None:
aid = self.id_map.get((href, ''))
if aid is None:
continue
pos, fid = self.aid_offset_map[aid]
if is_guide_ref_start(ref):
chunk = self.chunk_table[pos]
skel = [s for s in self.skel_table if s.file_number ==
chunk.file_number][0]
self.start_offset = skel.start_pos + skel.length + chunk.start_pos + fid
self.guide_table.append(GuideRef(ref.title or
_('Unknown'), ref.type, (pos, fid)))
if self.guide_table:
self.guide_table.sort(key=lambda x:x.type) # Needed by the Kindle
self.guide_records = GuideIndex(self.guide_table)()
def create_kf8_book(oeb, opts, resources, for_joint=False):
writer = KF8Writer(oeb, opts, resources)
return KF8Book(writer, for_joint=for_joint)

View File

@ -0,0 +1,311 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import time, random
from struct import pack
from calibre.ebooks.mobi.utils import RECORD_SIZE, utf8_text
from calibre.ebooks.mobi.writer8.header import Header
from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED)
from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.ebooks.mobi.writer8.exth import build_exth
from calibre.utils.filenames import ascii_filename
NULL_INDEX = 0xffffffff
class MOBIHeader(Header): # {{{
'''
Represents the first record in a MOBI file, contains all the metadata about
the file.
'''
DEFINITION = '''
# 0: Compression
compression = DYN
# 2: Unused
unused1 = zeroes(2)
# 4: Text length
text_length = DYN
# 8: Last text record
last_text_record = DYN
# 10: Text record size
record_size = {record_size}
# 12: Encryption Type
encryption_type
# 14: Unused
unused2
# 16: Ident
ident = b'MOBI'
# 20: Header length
header_length = 248
# 24: Book Type (0x2 - Book, 0x101 - News hierarchical, 0x102 - News
# (flat), 0x103 - News magazine same as 0x101)
book_type = DYN
# 28: Text encoding (utf-8 = 65001)
encoding = 65001
# 32: UID
uid = DYN
# 36: File version
file_version = {file_version}
# 40: Meta orth record (used in dictionaries)
meta_orth_record = NULL
# 44: Meta infl index
meta_infl_index = NULL
# 48: Extra indices
extra_index0 = NULL
extra_index1 = NULL
extra_index2 = NULL
extra_index3 = NULL
extra_index4 = NULL
extra_index5 = NULL
extra_index6 = NULL
extra_index7 = NULL
# 80: First non text record
first_non_text_record = DYN
# 84: Title offset
title_offset
# 88: Title Length
title_length = DYN
# 92: Language code
language_code = DYN
# 96: Dictionary in and out languages
in_lang
out_lang
# 104: Min version
min_version = {file_version}
# 108: First resource record
first_resource_record = DYN
# 112: Huff/CDIC compression
huff_first_record
huff_count
# 120: Unknown (Maybe DATP related, maybe HUFF/CDIC related)
maybe_datp = zeroes(8)
# 128: EXTH flags
exth_flags = DYN
# 132: Unknown
unknown = zeroes(36)
# 168: DRM
drm_offset = NULL
drm_count
drm_size
drm_flags
# 184: Unknown
unknown2 = zeroes(8)
# 192: FDST
fdst_record = DYN
fdst_count = DYN
# 200: FCI
fcis_record = NULL
fcis_count
# 208: FLIS
flis_record = NULL
flis_count
# 216: Unknown
unknown3 = zeroes(8)
# 224: SRCS
srcs_record = NULL
srcs_count
# 232: Unknown
unknown4 = nulls(8)
# 240: Extra data flags
# 0b1 - extra multibyte bytes after text records
# 0b10 - TBS indexing data (only used in MOBI 6)
# 0b100 - uncrossable breaks (only used in MOBI 6)
extra_data_flags = DYN
# 244: KF8 Indices
ncx_index = DYN
chunk_index = DYN
skel_index = DYN
datp_index = NULL
guide_index = DYN
# 264: EXTH
exth = DYN
# Full title
full_title = DYN
# Padding to allow Amazon's DTP service to add data
padding = zeroes(8192)
'''
SHORT_FIELDS = {'compression', 'last_text_record', 'record_size',
'encryption_type', 'unused2'}
ALIGN_BLOCK = True
POSITIONS = {'title_offset':'full_title'}
def __init__(self, file_version=8):
self.DEFINITION = self.DEFINITION.format(file_version=file_version,
record_size=RECORD_SIZE)
super(MOBIHeader, self).__init__()
def format_value(self, name, val):
if name == 'compression':
val = PALMDOC if val else UNCOMPRESSED
return super(MOBIHeader, self).format_value(name, val)
# }}}
HEADER_FIELDS = {'compression', 'text_length', 'last_text_record', 'book_type',
'first_non_text_record', 'title_length', 'language_code',
'first_resource_record', 'exth_flags', 'fdst_record',
'fdst_count', 'ncx_index', 'chunk_index', 'skel_index',
'guide_index', 'exth', 'full_title', 'extra_data_flags',
'uid'}
class KF8Book(object):
def __init__(self, writer, for_joint=False):
self.build_records(writer, for_joint)
self.used_images = writer.used_images
def build_records(self, writer, for_joint):
metadata = writer.oeb.metadata
# The text records
for x in ('last_text_record_idx', 'first_non_text_record_idx'):
setattr(self, x.rpartition('_')[0], getattr(writer, x))
self.records = writer.records
self.text_length = writer.text_length
# KF8 Indices
self.chunk_index = len(self.records)
self.records.extend(writer.chunk_records)
self.skel_index = len(self.records)
self.records.extend(writer.skel_records)
self.guide_index = NULL_INDEX
if writer.guide_records:
self.guide_index = len(self.records)
self.records.extend(writer.guide_records)
self.ncx_index = NULL_INDEX
if writer.ncx_records:
self.ncx_index = len(self.records)
self.records.extend(writer.ncx_records)
# Resources
resources = writer.resources
for x in ('cover_offset', 'thumbnail_offset', 'masthead_offset'):
setattr(self, x, getattr(resources, x))
self.first_resource_record = NULL_INDEX
before = len(self.records)
if resources.records:
self.first_resource_record = len(self.records)
if not for_joint:
resources.serialize(self.records, writer.used_images)
self.num_of_resources = len(self.records) - before
# FDST
self.fdst_count = writer.fdst_count
self.fdst_record = len(self.records)
self.records.extend(writer.fdst_records)
# EOF
self.records.append(b'\xe9\x8e\r\n') # EOF record
# Miscellaneous header fields
self.compression = writer.compress
self.book_type = 0x101 if writer.opts.mobi_periodical else 2
self.full_title = utf8_text(unicode(metadata.title[0]))
self.title_length = len(self.full_title)
self.extra_data_flags = 0b1
if writer.has_tbs:
self.extra_data_flags |= 0b10
self.uid = random.randint(0, 0xffffffff)
self.language_code = iana2mobi(str(metadata.language[0]))
self.exth_flags = 0b1010000
if writer.opts.mobi_periodical:
self.exth_flags |= 0b1000
self.opts = writer.opts
self.start_offset = writer.start_offset
self.metadata = metadata
self.kuc = 0 if len(resources.records) > 0 else None
@property
def record0(self):
''' We generate the EXTH header and record0 dynamically, to allow other
code to customize various values after build_records() has been
called'''
opts = self.opts
self.exth = build_exth(self.metadata,
prefer_author_sort=opts.prefer_author_sort,
is_periodical=opts.mobi_periodical,
share_not_sync=opts.share_not_sync,
cover_offset=self.cover_offset,
thumbnail_offset=self.thumbnail_offset,
num_of_resources=self.num_of_resources,
kf8_unknown_count=self.kuc, be_kindlegen2=True,
start_offset=self.start_offset, mobi_doctype=self.book_type)
kwargs = {field:getattr(self, field) for field in HEADER_FIELDS}
return MOBIHeader()(**kwargs)
def write(self, outpath):
records = [self.record0] + self.records[1:]
with open(outpath, 'wb') as f:
# Write PalmDB Header
title = ascii_filename(self.full_title.decode('utf-8')).replace(
' ', '_')[:31]
title += (b'\0' * (32 - len(title)))
now = int(time.time())
nrecords = len(records)
f.write(title)
f.write(pack(b'>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0))
f.write(b'BOOKMOBI')
f.write(pack(b'>IIH', (2*nrecords)-1, 0, nrecords))
offset = f.tell() + (8 * nrecords) + 2
for i, record in enumerate(records):
f.write(pack(b'>I', offset))
f.write(b'\0' + pack(b'>I', 2*i)[1:])
offset += len(record)
f.write(b'\0\0')
for rec in records:
f.write(rec)
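# For reference, the PalmDB layout written above is: a NUL-padded 32-byte
# name, attribute and version words, creation and modification timestamps
# (backup time, modification number, appinfo and sortinfo left at zero),
# the b'BOOKMOBI' type/creator pair, a uid seed, a zero next-record-list
# pointer, a record count, one 8-byte slot per record (a 4-byte file
# offset, a zero flag byte and a 3-byte record id of 2*i), and two pad
# bytes before the concatenated record payloads.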

View File

@ -0,0 +1,417 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from collections import namedtuple
from functools import partial
from lxml import etree
from calibre.ebooks.oeb.base import XHTML_NS
from calibre.constants import ispy3
from calibre.ebooks.mobi.utils import to_base
CHUNK_SIZE = 8192
# References in links are stored with 10 digits
to_href = partial(to_base, base=32, min_num_digits=10)
# Tags to which kindlegen adds the aid attribute
aid_able_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b',
'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details',
'dfn', 'div', 'dl', 'dt', 'em', 'fieldset', 'figcaption', 'figure', 'footer',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'i', 'ins', 'kbd',
'label', 'legend', 'li', 'map', 'mark', 'meter', 'nav', 'ol', 'output', 'p',
'pre', 'progress', 'q', 'rp', 'rt', 'samp', 'section', 'select', 'small',
'span', 'strong', 'sub', 'summary', 'sup', 'textarea', 'time', 'ul', 'var',
'video'}
_self_closing_pat = re.compile(bytes(
r'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>'%('|'.join(aid_able_tags))),
re.IGNORECASE)
def close_self_closing_tags(raw):
return _self_closing_pat.sub(br'<\g<tag>\g<arg>></\g<tag>>', raw)
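# For example:
# close_self_closing_tags(b'<div class="x"/>') == b'<div class="x"></div>'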
def path_to_node(node):
ans = []
parent = node.getparent()
while parent is not None:
ans.append(parent.index(node))
node = parent
parent = parent.getparent()
return tuple(reversed(ans))
def node_from_path(root, path):
parent = root
for idx in path:
parent = parent[idx]
return parent
mychr = chr if ispy3 else unichr
def tostring(raw, **kwargs):
''' lxml *sometimes* represents non-ascii characters as hex entities in
attribute values. I can't figure out exactly what circumstances cause it.
It seems to happen when serializing a part of a larger tree. Since we need
serialization to be the same when serializing full and partial trees, we
manually replace all hex entities with their unicode codepoints. '''
xml_declaration = kwargs.pop('xml_declaration', False)
encoding = kwargs.pop('encoding', 'UTF-8')
kwargs['encoding'] = unicode
kwargs['xml_declaration'] = False
ans = etree.tostring(raw, **kwargs)
if xml_declaration:
ans = '<?xml version="1.0" encoding="%s"?>\n'%encoding + ans
return re.sub(r'&#x([0-9A-Fa-f]+);', lambda m:mychr(int(m.group(1), 16)),
ans).encode(encoding)
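# For example, lxml may serialize <a title="é"> as <a title="&#xe9;"> when
# dumping a subtree; the substitution above folds such entities back to
# their codepoints so that full and partial serializations stay
# byte-identical.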
class Chunk(object):
def __init__(self, raw, parent_tag):
self.raw = raw
self.starts_tags = []
self.ends_tags = []
self.insert_pos = None
self.parent_tag = parent_tag
self.parent_is_body = False
self.is_last_chunk = False
self.is_first_chunk = False
def __len__(self):
return len(self.raw)
def merge(self, chunk):
self.raw += chunk.raw
self.ends_tags = chunk.ends_tags
def __repr__(self):
return 'Chunk(len=%r insert_pos=%r starts_tags=%r ends_tags=%r)'%(
len(self.raw), self.insert_pos, self.starts_tags, self.ends_tags)
@property
def selector(self):
typ = 'S' if (self.is_last_chunk and not self.parent_is_body) else 'P'
return "%s-//*[@aid='%s']"%(typ, self.parent_tag)
__str__ = __repr__
class Skeleton(object):
def __init__(self, file_number, item, root, chunks):
self.file_number, self.item = file_number, item
self.chunks = chunks
self.skeleton = self.render(root)
self.body_offset = self.skeleton.find('<body')
self.calculate_metrics(root)
self.calculate_insert_positions()
def render(self, root):
raw = tostring(root, xml_declaration=True)
raw = raw.replace(b'<html', bytes('<html xmlns="%s"'%XHTML_NS), 1)
return raw
def calculate_metrics(self, root):
Metric = namedtuple('Metric', 'start end')
self.metrics = {}
for tag in root.xpath('//*[@aid]'):
text = (tag.text or '').encode('utf-8')
raw = tostring(tag, with_tail=True)
start_length = len(raw.partition(b'>')[0]) + len(text) + 1
end_length = len(raw.rpartition(b'<')[-1]) + 1
self.metrics[tag.get('aid')] = Metric(start_length, end_length)
def calculate_insert_positions(self):
pos = self.body_offset
for chunk in self.chunks:
for tag in chunk.starts_tags:
pos += self.metrics[tag].start
chunk.insert_pos = pos
pos += len(chunk)
for tag in chunk.ends_tags:
pos += self.metrics[tag].end
def rebuild(self):
ans = self.skeleton
for chunk in self.chunks:
i = chunk.insert_pos
ans = ans[:i] + chunk.raw + ans[i:]
return ans
def __len__(self):
return len(self.skeleton) + sum([len(x.raw) for x in self.chunks])
@property
def raw_text(self):
return b''.join([self.skeleton] + [x.raw for x in self.chunks])
class Chunker(object):
def __init__(self, oeb, data_func, placeholder_map):
self.oeb, self.log = oeb, oeb.log
self.data = data_func
self.placeholder_map = placeholder_map
self.skeletons = []
# Set this to a list to enable dumping of the original and rebuilt
# html files for debugging
orig_dumps = None
for i, item in enumerate(self.oeb.spine):
root = self.remove_namespaces(self.data(item))
body = root.xpath('//body')[0]
body.tail = '\n'
if orig_dumps is not None:
orig_dumps.append(tostring(root, xml_declaration=True,
with_tail=True))
orig_dumps[-1] = close_self_closing_tags(
orig_dumps[-1].replace(b'<html',
bytes('<html xmlns="%s"'%XHTML_NS), 1))
# First pass: break up document into rendered strings of length no
# more than CHUNK_SIZE
chunks = []
self.step_into_tag(body, chunks)
# Second pass: Merge neighboring small chunks within the same
# skeleton tag so as to have chunks as close to the CHUNK_SIZE as
# possible.
chunks = self.merge_small_chunks(chunks)
# Third pass: Create the skeleton and calculate the insert position
# for all chunks
self.skeletons.append(Skeleton(i, item, root, chunks))
if orig_dumps:
self.dump(orig_dumps)
# Create the SKEL and Chunk tables
self.skel_table = []
self.chunk_table = []
self.create_tables()
# Set internal links
text = b''.join(x.raw_text for x in self.skeletons)
self.text = self.set_internal_links(text)
def remove_namespaces(self, root):
lang = None
for attr, val in root.attrib.iteritems():
if attr.rpartition('}')[-1] == 'lang':
lang = val
# Remove all namespace information from the tree. This means namespaced
# tags have their namespaces removed and all namespace declarations are
# removed. We have to do this manual cloning of the tree as there is no
# other way to remove namespace declarations in lxml. This is done so
# that serialization creates clean HTML 5 markup with no namespaces. We
# insert the XHTML namespace manually after serialization. The
# preceding layers should have removed svg and any other non html
# namespaced tags.
attrib = {'lang':lang} if lang else {}
nroot = etree.Element('html', attrib=attrib)
nroot.text = root.text
nroot.tail = '\n'
for tag in root.iterdescendants(etree.Element):
# We are ignoring all non tag entities in the tree
# like comments and processing instructions, as they make the
# chunking code even harder, for minimal gain.
elem = nroot.makeelement(tag.tag.rpartition('}')[-1],
attrib={k.rpartition('}')[-1]:v for k, v in
tag.attrib.iteritems()})
elem.text, elem.tail = tag.text, tag.tail
parent = node_from_path(nroot, path_to_node(tag.getparent()))
parent.append(elem)
return nroot
def step_into_tag(self, tag, chunks):
aid = tag.get('aid')
is_body = tag.tag == 'body'
first_chunk_idx = len(chunks)
# First handle any text
if tag.text and tag.text.strip(): # Leave pure whitespace in the skel
chunks.extend(self.chunk_up_text(tag.text, aid))
tag.text = None
# Now loop over children
for child in list(tag):
raw = tostring(child, with_tail=False)
raw = close_self_closing_tags(raw)
if len(raw) > CHUNK_SIZE and child.get('aid', None):
self.step_into_tag(child, chunks)
if child.tail and child.tail.strip(): # Leave pure whitespace
chunks.extend(self.chunk_up_text(child.tail, aid))
child.tail = None
else:
if len(raw) > CHUNK_SIZE:
self.log.warn('Tag %s has no aid and is too large for a'
' single chunk. Adding it anyway.'%child.tag)
chunks.append(Chunk(raw, aid))
if child.tail:
chunks.extend(self.chunk_up_text(child.tail, aid))
tag.remove(child)
if len(chunks) <= first_chunk_idx and chunks:
raise ValueError('Stepped into a tag that generated no chunks.')
# Mark the first and last chunks of this tag
if chunks:
chunks[first_chunk_idx].starts_tags.append(aid)
chunks[-1].ends_tags.append(aid)
my_chunks = chunks[first_chunk_idx:]
if my_chunks:
my_chunks[0].is_first_chunk = True
my_chunks[-1].is_last_chunk = True
if is_body:
for chunk in my_chunks:
chunk.parent_is_body = True
def chunk_up_text(self, text, parent_tag):
text = text.encode('utf-8')
ans = []
def split_multibyte_text(raw):
if len(raw) <= CHUNK_SIZE:
return raw, b''
l = raw[:CHUNK_SIZE]
l = l.decode('utf-8', 'ignore').encode('utf-8')
return l, raw[len(l):]
start, rest = split_multibyte_text(text)
ans.append(start)
while rest:
start, rest = split_multibyte_text(rest)
ans.append(b'<span class="AmznBigTextBlock">' + start + b'</span>')
return [Chunk(x, parent_tag) for x in ans]
def merge_small_chunks(self, chunks):
ans = chunks[:1]
for chunk in chunks[1:]:
prev = ans[-1]
if (
chunk.starts_tags or # Starts a tag in the skel
len(chunk) + len(prev) > CHUNK_SIZE or # Too large
prev.ends_tags # Prev chunk ended a tag
):
ans.append(chunk)
else:
prev.merge(chunk)
return ans
def create_tables(self):
Skel = namedtuple('Skel',
'file_number name chunk_count start_pos length')
sp = 0
for s in self.skeletons:
s.start_pos = sp
sp += len(s)
self.skel_table = [Skel(s.file_number, 'SKEL%010d'%s.file_number,
len(s.chunks), s.start_pos, len(s.skeleton)) for s in self.skeletons]
Chunk = namedtuple('Chunk',
'insert_pos selector file_number sequence_number start_pos length')
num = 0
for skel in self.skeletons:
cp = 0
for chunk in skel.chunks:
self.chunk_table.append(
Chunk(chunk.insert_pos + skel.start_pos, chunk.selector,
skel.file_number, num, cp, len(chunk.raw)))
cp += len(chunk.raw)
num += 1

    def set_internal_links(self, text):
        ''' Update the internal link placeholders to point to the correct
        location, based on the chunk table. '''
        # A kindle:pos:fid link contains two base-32 numbers of the form
        # XXXX:YYYYYYYYYY, where the first number is an index into the chunk
        # table and the second is an offset from the start of the chunk to
        # the start of the tag pointed to by the link.
        aid_map = {}  # Map of aid to (pos, fid)
        for match in re.finditer(br'<[^>]+? aid=[\'"]([A-Z0-9]+)[\'"]', text):
            offset = match.start()
            pos_fid = None
            for chunk in self.chunk_table:
                if chunk.insert_pos <= offset < chunk.insert_pos + chunk.length:
                    pos_fid = (chunk.sequence_number, offset-chunk.insert_pos)
                    break
                if chunk.insert_pos > offset:
                    # This aid is in the skeleton, not in a chunk, so we use
                    # the chunk immediately after it
                    pos_fid = (chunk.sequence_number, 0)
                    break
            if pos_fid is None:
                raise ValueError('Could not find chunk for aid: %r'%
                        match.group(1))
            aid_map[match.group(1)] = pos_fid

        self.aid_offset_map = aid_map

        def to_placeholder(aid):
            pos, fid = aid_map[aid]
            pos, fid = to_base(pos, min_num_digits=4), to_href(fid)
            return bytes(':off:'.join((pos, fid)))

        placeholder_map = {bytes(k):to_placeholder(v) for k, v in
                self.placeholder_map.iteritems()}

        # Now update the links
        def sub(match):
            raw = match.group()
            pl = match.group(1)
            try:
                # Replace the 19 byte dummy suffix 0000:off:0000000000 with
                # the real position
                return raw[:-19] + placeholder_map[pl]
            except KeyError:
                pass
            return raw

        return re.sub(br'<[^>]+(kindle:pos:fid:0000:off:[0-9A-Za-z]{10})', sub,
                text)
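
    # Worked example (illustrative, not from the original source): for an aid
    # that resolves to chunk table entry 12 at byte offset 100 from the start
    # of its chunk, to_base(12, min_num_digits=4) yields '000C' and, assuming
    # to_href() is a ten digit base-32 encoder, the offset 100 becomes
    # '0000000034', so the rewritten link reads
    #   kindle:pos:fid:000C:off:0000000034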

    def dump(self, orig_dumps):
        import tempfile, shutil, os
        tdir = os.path.join(tempfile.gettempdir(), 'skeleton')
        self.log('Skeletons dumped to:', tdir)
        if os.path.exists(tdir):
            shutil.rmtree(tdir)
        orig = os.path.join(tdir, 'orig')
        rebuilt = os.path.join(tdir, 'rebuilt')
        chunks = os.path.join(tdir, 'chunks')
        for x in (orig, rebuilt, chunks):
            os.makedirs(x)
        error = False
        for i, skeleton in enumerate(self.skeletons):
            for j, chunk in enumerate(skeleton.chunks):
                with open(os.path.join(chunks, 'file-%d-chunk-%d.html'%(i, j)),
                        'wb') as f:
                    f.write(chunk.raw)
            oraw, rraw = orig_dumps[i], skeleton.rebuild()
            with open(os.path.join(orig, '%04d.html'%i), 'wb') as f:
                f.write(oraw)
            with open(os.path.join(rebuilt, '%04d.html'%i), 'wb') as f:
                f.write(rraw)
            if oraw != rraw:
                error = True
        if error:
            raise ValueError('The before and after HTML differs. Run a diff '
                    'tool on the orig and rebuilt directories')
        else:
            self.log('Skeleton HTML before and after is identical.')


@@ -0,0 +1,109 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__   = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from collections import namedtuple
from functools import partial

from calibre.ebooks.mobi.utils import (RECORD_SIZE, encode_trailing_data,
        encode_tbs)

Entry = namedtuple('IndexEntry', 'index start length depth parent '
        'first_child last_child title')
Data = namedtuple('Data', 'starts ends completes spans')

def collect_indexing_data(entries, number_of_text_records):
    ''' For every text record calculate which index entries start, end, span
    or are contained within that record. '''
    data = []
    for i in xrange(number_of_text_records):
        record_start, next_record_start = i*RECORD_SIZE, (i+1)*RECORD_SIZE
        datum = Data([], [], [], [])
        data.append(datum)

        for entry in entries:
            end = entry.start + entry.length - 1
            if (entry.start >= next_record_start or end < record_start):
                # This entry does not have any overlap with this record
                continue
            if (entry.start < record_start and end >= next_record_start):
                # This entry spans this record
                datum.spans.append(entry)
                continue
            if (entry.start >= record_start and end < next_record_start):
                # This entry is contained in this record
                datum.completes.append(entry)
                continue
            if (entry.start >= record_start and end >= next_record_start):
                # This entry starts in this record
                datum.starts.append(entry)
                continue
            if (entry.start < record_start and end < next_record_start):
                # This entry ends in this record
                datum.ends.append(entry)

        for x in datum:
            # Should be unnecessary as entries are already in this order, but
            # best to be safe.
            x.sort(key=lambda x:x.depth)

    return data
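
# Worked example (illustrative, not from the original source): an entry with
# start == RECORD_SIZE//2 and length == 2*RECORD_SIZE has
# end == 2.5*RECORD_SIZE - 1, so over three text records it is classified as
# starting in record 0, spanning record 1 and ending in record 2.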

def generate_tbs_for_flat_index(indexing_data):
    ans = []
    record_type = 8  # 8 for KF8, 0 for MOBI 6
    enc = partial(encode_tbs, flag_size=3)
    for datum in indexing_data:
        tbs = b''
        extra = {0b010 : record_type}
        if not (datum.starts or datum.ends or datum.completes or datum.spans):
            # No index entry touches this record
            pass
        elif datum.spans:
            extra[0b001] = 0
            tbs = enc(datum.spans[0].index, extra)
        else:
            starts, ends, completes = datum[:3]
            if (not completes and len(starts) + len(ends) == 1):
                # This record contains either the start or the end of a
                # single index entry, and nothing else
                node = (starts+ends)[0]
                tbs = enc(node.index, extra)
            else:
                # This record contains the end of an index and some complete
                # index entries. Or it contains some complete entries and a
                # start. Or it contains an end, a start and optionally some
                # completes. In every case, we encode the first entry to
                # touch this record and the number of entries that touch this
                # record.
                nodes = starts + completes + ends
                nodes.sort(key=lambda x:x.index)
                extra[0b100] = len(nodes)
                tbs = enc(nodes[0].index, extra)
        ans.append(tbs)
    return ans
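
# Illustrative example (not from the original source): a record touched by
# entry 3 (which ends in it), entry 4 (contained in it) and entry 5 (which
# starts in it) takes the final branch above and is encoded as
#   encode_tbs(3, {0b010: 8, 0b100: 3}, flag_size=3)
# that is, the first entry to touch the record has index 3, and three entries
# touch the record in total.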

def apply_trailing_byte_sequences(index_table, records, number_of_text_records):
    entries = tuple(Entry(r['index'], r['offset'], r['length'], r['depth'],
        r.get('parent', None), r.get('first_child', None), r.get('last_child',
            None), r['label']) for r in index_table)

    indexing_data = collect_indexing_data(entries, number_of_text_records)
    max_depth = max(e['depth'] for e in index_table)
    if max_depth > 0:
        # TODO: Implement for hierarchical ToCs
        tbs = []
    else:
        tbs = generate_tbs_for_flat_index(indexing_data)
    if not tbs:
        return False
    for i, tbs_bytes in enumerate(tbs):
        records[i+1] += encode_trailing_data(tbs_bytes)
    return True
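
if __name__ == '__main__':
    # Minimal smoke test, added for illustration only (not part of the
    # original file): a single flat entry that starts half way into record 0
    # and ends inside record 1 must be classified accordingly.
    demo_entry = Entry(index=0, start=RECORD_SIZE//2, length=RECORD_SIZE,
            depth=0, parent=None, first_child=None, last_child=None,
            title='Chapter 1')
    data = collect_indexing_data((demo_entry,), 2)
    assert demo_entry in data[0].starts
    assert demo_entry in data[1].ends
    print('collect_indexing_data smoke test passed')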
