Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)

Commit ee82fdac05: Merge from trunk

Changelog.yaml (152 lines changed)
@@ -19,6 +19,158 @@
# new recipes:
#   - title:

- version: 0.8.49
  date: 2012-04-27

  new features:
    - title: "Experimental support for generating Amazon's new KF8 format MOBI files"
      description: "calibre can now generate Amazon's new KF8 format MOBI files.
      To turn on this feature, go to Preferences->Tweaks and click Plugin Tweaks. In the box add:

      test_mobi_output_type = 'both'

      calibre will now produce MOBI files that have both the old MOBI format and the new KF8 format in them.
      To learn more about KF8, see: http://www.amazon.com/gp/feature.html?docId=1000729511
      Note that calibre support for KF8 is still experimental and there will likely be bugs."
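
(Illustrative sketch, not part of the changelog: one way code running inside calibre could check whether this plugin tweak is active. It assumes calibre's tweaks dictionary, calibre.utils.config.tweaks; only the tweak name itself comes from the entry above.)

    # Hypothetical check for the KF8 plugin tweak described above (Python 2, as calibre used at the time)
    from calibre.utils.config import tweaks

    if tweaks.get('test_mobi_output_type', 'old') == 'both':
        print 'MOBI output will contain both the old MOBI records and the new KF8 records'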

    - title: "Upgrade to using cssutils 0.9.9 for CSS parsing. Improved speed and robustness."

    - title: "Show cover size in a tooltip in the conversion dialog"
      tickets: [986958]

    - title: "Driver for Nook Simple Touch with Glow Light"
      tickets: [989264]

  bug fixes:
    - title: "Heuristics: When italicizing words, do not operate on words that are not in between HTML tags."
      tickets: [986298]

    - title: "Fix (I hope) the bulk metadata download process crashing for some people on OS X when clicking the Yes button to apply the updates."
      tickets: [986658]

    - title: "Fix tooltip not being updated in the book details panel when pasting in a new cover"
      tickets: [986958]

    - title: "Cover Browser: Wrap the title only on spaces, not in between words."
      tickets: [986516]

    - title: "Edit metadata dialog: If a permission denied error occurs when clicking the next or prev buttons, stay on the current book."
      tickets: [986903]

    - title: "Fix heuristics not removing unnecessary hyphens from the end of lines."
      tickets: [822744]

  improved recipes:
    - Metro Nieuws NL
    - Der Tagesspiegel

  new recipes:
    - title: Berria
      author: Alayn Gortazar

    - title: Sol Haber
      author: Onur Gungor

    - title: Telam
      author: Darko Miletic

    - title: Richmond Times-Dispatch
      author: jde

- version: 0.8.48
  date: 2012-04-20

  new features:
    - title: "Conversion: The search and replace feature has been completely revamped."
      description: "You can now use any number of search and replace
      expressions, not just three. You can also store and load frequently used
      sets of search and replace expressions. Also, the wizard generates its
      preview in a separate process to protect against crashes/memory leaks."
      tickets: [983476,983484,983478]
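
(Illustrative example, not taken from the changelog: a stored search-and-replace set might hold pairs such as the following, applied in order to the book's HTML during conversion; both expressions are invented purely for illustration.)

    Search (regex):   <div class="pagebreak">\s*</div>
    Replace with:     <hr/>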

    - title: "Support for the new '.azw3' files that Amazon recently started generating. calibre will now detect them as ebooks. It can also view/convert them, if they are DRM free."
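
(Illustrative note, not part of the changelog: with this change a DRM-free .azw3 file can be opened or converted with calibre's existing command line tools; the file names below are placeholders.)

    ebook-viewer book.azw3
    ebook-convert book.azw3 book.epub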
- title: "Drivers for Samsung Galaxy ACE GT-S5830L and HTC One X"
|
||||
tickets: [981185]
|
||||
|
||||
bug fixes:
|
||||
- title: "Get Books: Support the new website design of Barnes & Noble"
|
||||
|
||||
- title: "T1 driver: Fix books sent to SD card sometimes resulting problems when deleted."
|
||||
tickets: [943586]
|
||||
|
||||
- title: "Do not allow author names to be set to blank via the Manage authors function. Blank authors are now automatically set to 'Unknown'"
|
||||
|
||||
- title: "MOBI Output: Handle background color specified on <td> and <tr> in addition to <table> tags."
|
||||
tickets: [980813]
|
||||
|
||||
- title: "MOBI Output: Fix underline style applied to parent element not getting inherited by <a> children."
|
||||
tickets: [985711]
|
||||
|
||||
improved recipes:
|
||||
- xkcd
|
||||
- Metro Nieuws
|
||||
- Calgary Herald
|
||||
- Orlando Sentinel
|
||||
- countryfile
|
||||
- Heise
|
||||
|
||||
new recipes:
|
||||
- title: Various new Polish news sources
|
||||
author: fenuks
|
||||
|
||||
- title: Various Italian news sources
|
||||
author: faber1971
|
||||
|
||||
- title: Jakarta Globe
|
||||
author: rty
|
||||
|
||||
- title: Acim Bilim Dergisi
|
||||
author: thomass
|
||||
|
||||
- version: 0.8.47
|
||||
date: 2012-04-13
|
||||
|
||||
new features:
|
||||
- title: "Conversion pipeline: Add support for all the named entities in the HTML 5 spec."
|
||||
tickets: [976056]
|
||||
|
||||
- title: "Support for viewing and converting the Haodoo PDB ebook format"
|
||||
tickets: [976478]
|
||||
|
||||
- title: "Device driver for Laser EB720"
|
||||
|
||||
bug fixes:
|
||||
- title: "Fix regression in automatic adding in 0.8.46 that broke automatic adding if adding of duplicates is enabled and auto convert is also enabled"
|
||||
tickets: [976336]
|
||||
|
||||
- title: 'Fix "Tags" field in advanced search does not obey regex setting'
|
||||
tickets: [980221]
|
||||
|
||||
- title: "EPUB Input: Automatically extract cover image from simple HTML title page that consists of only a single <img> tag, instead of rendering the page"
|
||||
|
||||
- title: "Prevent errors when both author and author_sort are used in a template for reading metadata from filenames for files on a device"
|
||||
|
||||
- title: "Amazon metadata download: Handle books whose titles start with a bracket."
|
||||
tickets: [976365]
|
||||
|
||||
- title: "Get Books: Fix downloading of purchased books from Baen"
|
||||
tickets: [975929]
|
||||
|
||||
|
||||
improved recipes:
|
||||
- Forbes
|
||||
- Caros Amigos
|
||||
- Trouw
|
||||
- Sun UK
|
||||
- Metro
|
||||
- Daily Mirror
|
||||
|
||||
new recipes:
|
||||
- title: "Melbourne Herald Sun"
|
||||
author: Ray Hartley
|
||||
|
||||
- title: "Editoriali and Zerocalcare"
|
||||
author: faber1971
|
||||
|
||||
- version: 0.8.46
|
||||
date: 2012-04-06
|
||||
|
||||
|
recipes/acim_bilim_dergisi.recipe (new file, 27 lines)
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-

from calibre.web.feeds.news import BasicNewsRecipe


class AdvancedUserRecipe1334868409(BasicNewsRecipe):
    title = u'AÇIK BİLİM DERGİSİ'
    description = ' Aylık çevrimiçi bilim dergisi'
    __author__ = u'thomass'
    oldest_article = 30
    max_articles_per_feed = 300
    auto_cleanup = True
    encoding = 'UTF-8'
    publisher = 'açık bilim'
    category = 'haber, bilim,TR,dergi'
    language = 'tr'
    publication_type = 'magazine '
    conversion_options = {
        'tags'             : category,
        'language'         : language,
        'publisher'        : publisher,
        'linearize_tables' : True
    }
    cover_img_url = 'http://www.acikbilim.com/wp-content/themes/Equilibrium/images/logodene.jpg'
    masthead_url = 'http://www.acikbilim.com/wp-content/themes/Equilibrium/images/logodene.jpg'

    feeds = [(u'Tüm Yayınlar', u'http://www.acikbilim.com/feed')]
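
(Usage sketch, not part of this commit: a recipe file such as the one above can be built into an ebook with calibre's ebook-convert tool, assuming a standard calibre installation; the output name is a placeholder and --test limits the fetch to a few articles per feed so the recipe can be checked quickly.)

    ebook-convert recipes/acim_bilim_dergisi.recipe output.epub --test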
@ -9,6 +9,7 @@ class Adventure_zone(BasicNewsRecipe):
|
||||
no_stylesheets = True
|
||||
oldest_article = 20
|
||||
max_articles_per_feed = 100
|
||||
index='http://www.adventure-zone.info/fusion/'
|
||||
use_embedded_content=False
|
||||
preprocess_regexps = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: '')]
|
||||
remove_tags_before= dict(name='td', attrs={'class':'main-bg'})
|
||||
@ -45,6 +46,19 @@ class Adventure_zone(BasicNewsRecipe):
|
||||
skip_tag = skip_tag.findAll(name='a')
|
||||
for r in skip_tag:
|
||||
if r.strong:
|
||||
word=r.strong.string
|
||||
if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word)):
|
||||
return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
|
||||
word=r.strong.string.lower()
|
||||
if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)):
|
||||
return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
footer=soup.find(attrs={'class':'news-footer middle-border'})
|
||||
if footer and len(footer('a'))>=2:
|
||||
footer('a')[1].extract()
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
for a in soup('a'):
|
||||
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
|
||||
a['href']=self.index + a['href']
|
||||
return soup
|
||||
|
||||
|
@ -68,4 +68,7 @@ class Benchmark_pl(BasicNewsRecipe):
|
||||
self.image_article(soup, soup.body)
|
||||
else:
|
||||
self.append_page(soup, soup.body)
|
||||
for a in soup('a'):
|
||||
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
|
||||
a['href']=self.INDEX + a['href']
|
||||
return soup
|
||||
|
recipes/berria.recipe (new file, 44 lines)
@@ -0,0 +1,44 @@
__license__   = 'GPL v3'
__copyright__ = '2012, Alayn Gortazar <zutoin at gmail dot com>'
'''
www.berria.info
'''

from calibre.web.feeds.news import BasicNewsRecipe


class Berria(BasicNewsRecipe):
    title = 'Berria'
    __author__ = 'Alayn Gortazar'
    description = 'Euskal Herriko euskarazko egunkaria'
    publisher = 'Berria'
    category = 'news, politics, sports, Basque Country'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    language = 'eu'
    remove_empty_feeds = True
    masthead_url = 'http://upload.wikimedia.org/wikipedia/commons/thumb/6/6a/Berria_Logo.svg/400px-Berria_Logo.svg.png'

    keep_only_tags = [
        dict(id='goiburua'),
        dict(name='div', attrs={'class':['ber_ikus']}),
        dict(name='section', attrs={'class':'ber_ikus'})
    ]
    remove_tags = [
        dict(name='a', attrs={'class':'iruzkinak'}),
        dict(name='div', attrs={'class':'laguntzaileak'})
    ]

    extra_css = '#goiburua{font-weight: bold} .zintiloa{font-size: small} .sarrera{color:#666} .titularra{font-size: x-large} .sarrera{font-weight: bold} .argazoin{color:#666; font-size: small}'

    feeds = [
        (u'Edizioa jarraia', u'http://berria.info/rss/ediziojarraia.xml'),
        (u'Iritzia', u'http://berria.info/rss/iritzia.xml'),
        (u'Euskal Herria', u'http://berria.info/rss/euskalherria.xml'),
        (u'Ekonomia', u'http://berria.info/rss/ekonomia.xml'),
        (u'Mundua', u'http://berria.info/rss/mundua.xml'),
        (u'Kirola', u'http://berria.info/rss/kirola.xml'),
        (u'Plaza', u'http://berria.info/rss/plaza.xml')
    ]
@ -1,220 +1,35 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
|
||||
'''
|
||||
www.canada.com
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||
|
||||
|
||||
class CanWestPaper(BasicNewsRecipe):
|
||||
|
||||
# un-comment the following four lines for the Victoria Times Colonist
|
||||
## title = u'Victoria Times Colonist'
|
||||
## url_prefix = 'http://www.timescolonist.com'
|
||||
## description = u'News from Victoria, BC'
|
||||
## fp_tag = 'CAN_TC'
|
||||
|
||||
# un-comment the following four lines for the Vancouver Province
|
||||
## title = u'Vancouver Province'
|
||||
## url_prefix = 'http://www.theprovince.com'
|
||||
## description = u'News from Vancouver, BC'
|
||||
## fp_tag = 'CAN_VP'
|
||||
|
||||
# un-comment the following four lines for the Vancouver Sun
|
||||
## title = u'Vancouver Sun'
|
||||
## url_prefix = 'http://www.vancouversun.com'
|
||||
## description = u'News from Vancouver, BC'
|
||||
## fp_tag = 'CAN_VS'
|
||||
|
||||
# un-comment the following four lines for the Edmonton Journal
|
||||
## title = u'Edmonton Journal'
|
||||
## url_prefix = 'http://www.edmontonjournal.com'
|
||||
## description = u'News from Edmonton, AB'
|
||||
## fp_tag = 'CAN_EJ'
|
||||
|
||||
# un-comment the following four lines for the Calgary Herald
|
||||
title = u'Calgary Herald'
|
||||
url_prefix = 'http://www.calgaryherald.com'
|
||||
description = u'News from Calgary, AB'
|
||||
fp_tag = 'CAN_CH'
|
||||
|
||||
# un-comment the following four lines for the Regina Leader-Post
|
||||
## title = u'Regina Leader-Post'
|
||||
## url_prefix = 'http://www.leaderpost.com'
|
||||
## description = u'News from Regina, SK'
|
||||
## fp_tag = ''
|
||||
|
||||
# un-comment the following four lines for the Saskatoon Star-Phoenix
|
||||
## title = u'Saskatoon Star-Phoenix'
|
||||
## url_prefix = 'http://www.thestarphoenix.com'
|
||||
## description = u'News from Saskatoon, SK'
|
||||
## fp_tag = ''
|
||||
|
||||
# un-comment the following four lines for the Windsor Star
|
||||
## title = u'Windsor Star'
|
||||
## url_prefix = 'http://www.windsorstar.com'
|
||||
## description = u'News from Windsor, ON'
|
||||
## fp_tag = 'CAN_'
|
||||
|
||||
# un-comment the following four lines for the Ottawa Citizen
|
||||
## title = u'Ottawa Citizen'
|
||||
## url_prefix = 'http://www.ottawacitizen.com'
|
||||
## description = u'News from Ottawa, ON'
|
||||
## fp_tag = 'CAN_OC'
|
||||
|
||||
# un-comment the following four lines for the Montreal Gazette
|
||||
## title = u'Montreal Gazette'
|
||||
## url_prefix = 'http://www.montrealgazette.com'
|
||||
## description = u'News from Montreal, QC'
|
||||
## fp_tag = 'CAN_MG'
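# (Illustrative note, not part of this recipe: to build, for example, the Ottawa Citizen edition
# instead, the four active Calgary Herald lines above would be commented out and the four
# Ottawa Citizen lines above uncommented, i.e.:
##    title = u'Ottawa Citizen'
##    url_prefix = 'http://www.ottawacitizen.com'
##    description = u'News from Ottawa, ON'
##    fp_tag = 'CAN_OC'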
|
||||
|
||||
|
||||
language = 'en_CA'
|
||||
__author__ = 'Nick Redding'
|
||||
no_stylesheets = True
|
||||
timefmt = ' [%b %d]'
|
||||
extra_css = '''
|
||||
.timestamp { font-size:xx-small; display: block; }
|
||||
#storyheader { font-size: medium; }
|
||||
#storyheader h1 { font-size: x-large; }
|
||||
#storyheader h2 { font-size: large; font-style: italic; }
|
||||
.byline { font-size:xx-small; }
|
||||
#photocaption { font-size: small; font-style: italic }
|
||||
#photocredit { font-size: xx-small; }'''
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
|
||||
remove_tags = [{'class':'comments'},
|
||||
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
|
||||
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
|
||||
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
|
||||
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
|
||||
dict(name='div', attrs={'class':'rule_grey_solid'}),
|
||||
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
|
||||
|
||||
def get_cover_url(self):
|
||||
from datetime import timedelta, date
|
||||
if self.fp_tag=='':
|
||||
return None
|
||||
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
daysback=1
|
||||
try:
|
||||
br.open(cover)
|
||||
except:
|
||||
while daysback<7:
|
||||
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
try:
|
||||
br.open(cover)
|
||||
except:
|
||||
daysback = daysback+1
|
||||
continue
|
||||
break
|
||||
if daysback==7:
|
||||
self.log("\nCover unavailable")
|
||||
cover = None
|
||||
return cover
|
||||
|
||||
def fixChars(self,string):
|
||||
# Replace lsquo (\x91)
|
||||
fixed = re.sub("\x91","‘",string)
|
||||
# Replace rsquo (\x92)
|
||||
fixed = re.sub("\x92","’",fixed)
|
||||
# Replace ldquo (\x93)
|
||||
fixed = re.sub("\x93","“",fixed)
|
||||
# Replace rdquo (\x94)
|
||||
fixed = re.sub("\x94","”",fixed)
|
||||
# Replace ndash (\x96)
|
||||
fixed = re.sub("\x96","–",fixed)
|
||||
# Replace mdash (\x97)
|
||||
fixed = re.sub("\x97","—",fixed)
|
||||
fixed = re.sub("’","’",fixed)
|
||||
return fixed
|
||||
|
||||
def massageNCXText(self, description):
|
||||
# Kindle TOC descriptions won't render certain characters
|
||||
if description:
|
||||
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
||||
# Replace '&' with '&'
|
||||
massaged = re.sub("&","&", massaged)
|
||||
return self.fixChars(massaged)
|
||||
else:
|
||||
return description
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
if first:
|
||||
picdiv = soup.find('body').find('img')
|
||||
if picdiv is not None:
|
||||
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
|
||||
xtitle = article.text_summary.strip()
|
||||
if len(xtitle) == 0:
|
||||
desc = soup.find('meta',attrs={'property':'og:description'})
|
||||
if desc is not None:
|
||||
article.summary = article.text_summary = desc['content']
|
||||
|
||||
def strip_anchors(self,soup):
|
||||
paras = soup.findAll(True)
|
||||
for para in paras:
|
||||
aTags = para.findAll('a')
|
||||
for a in aTags:
|
||||
if a.img is None:
|
||||
a.replaceWith(a.renderContents().decode('cp1252','replace'))
|
||||
return soup
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
return self.strip_anchors(soup)
|
||||
|
||||
|
||||
|
||||
def parse_index(self):
|
||||
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
|
||||
|
||||
articles = {}
|
||||
key = 'News'
|
||||
ans = ['News']
|
||||
|
||||
# Find each instance of class="sectiontitle", class="featurecontent"
|
||||
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
|
||||
#self.log(" div class = %s" % divtag['class'])
|
||||
if divtag['class'].startswith('section_title'):
|
||||
# div contains section title
|
||||
if not divtag.h3:
|
||||
continue
|
||||
key = self.tag_to_string(divtag.h3,False)
|
||||
ans.append(key)
|
||||
self.log("Section name %s" % key)
|
||||
continue
|
||||
# div contains article data
|
||||
h1tag = divtag.find('h1')
|
||||
if not h1tag:
|
||||
continue
|
||||
atag = h1tag.find('a',href=True)
|
||||
if not atag:
|
||||
continue
|
||||
url = self.url_prefix+'/news/todays-paper/'+atag['href']
|
||||
#self.log("Section %s" % key)
|
||||
#self.log("url %s" % url)
|
||||
title = self.tag_to_string(atag,False)
|
||||
#self.log("title %s" % title)
|
||||
pubdate = ''
|
||||
description = ''
|
||||
ptag = divtag.find('p');
|
||||
if ptag:
|
||||
description = self.tag_to_string(ptag,False)
|
||||
#self.log("description %s" % description)
|
||||
author = ''
|
||||
autag = divtag.find('h4')
|
||||
if autag:
|
||||
author = self.tag_to_string(autag,False)
|
||||
#self.log("author %s" % author)
|
||||
if not articles.has_key(key):
|
||||
articles[key] = []
|
||||
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
|
||||
|
||||
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
|
||||
return ans
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class CalgaryHerald(BasicNewsRecipe):
|
||||
title = u'Calgary Herald'
|
||||
oldest_article = 3
|
||||
max_articles_per_feed = 100
|
||||
|
||||
feeds = [
|
||||
(u'News', u'http://rss.canada.com/get/?F233'),
|
||||
(u'Calgary', u'http://www.calgaryherald.com/scripts/sp6query.aspx?catalog=cahr&tags=keyword|calgary&output=rss?link=http%3a%2f%2fwww.calgaryherald'),
|
||||
(u'Alberta', u'http://www.calgaryherald.com/scripts/Sp6Query.aspx?catalog=CAHR&tags=Keyword|Alberta&output=rss?link=http%3A%2F%2Fwww.calgaryherald.com%2Fnews%2Falberta%2Findex.html'),
|
||||
(u'Politics', u'http://rss.canada.com/get/?F7551'),
|
||||
(u'National', u'http://rss.canada.com/get/?F7552'),
|
||||
(u'World', u'http://rss.canada.com/get/?F7553'),
|
||||
]
|
||||
__author__ = 'rty'
|
||||
publisher = 'Calgary Herald'
|
||||
description = 'Calgary, Alberta, Canada'
|
||||
category = 'News, Calgary, Alberta, Canada'
|
||||
|
||||
|
||||
remove_javascript = True
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
language = 'en_CA'
|
||||
encoding = 'utf-8'
|
||||
conversion_options = {'linearize_tables':True}
|
||||
##masthead_url = 'http://www.calgaryherald.com/index.html'
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'id':'storyheader'}),
|
||||
dict(name='div', attrs={'id':'storycontent'})
|
||||
|
||||
]
|
||||
remove_tags_after = {'class':"story_tool_hr"}
|
||||
|
||||
|
recipes/camera_di_commercio_di_bari.recipe (new file, 17 lines)
@@ -0,0 +1,17 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1331729727(BasicNewsRecipe):
|
||||
title = u'Camera di Commercio di Bari'
|
||||
oldest_article = 7
|
||||
__author__ = 'faber1971'
|
||||
description = 'News from the Chamber of Commerce of Bari'
|
||||
language = 'it'
|
||||
max_articles_per_feed = 100
|
||||
auto_cleanup = True
|
||||
masthead_url = 'http://www.ba.camcom.it/grafica/layout-bordo/logo_camcom_bari.png'
|
||||
feeds = [(u'Camera di Commercio di Bari', u'http://feed43.com/4715147488845101.xml')]
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, faber1971'
|
||||
__version__ = 'v1.00'
|
||||
__date__ = '17, April 2012'
|
@ -6,6 +6,7 @@ class CD_Action(BasicNewsRecipe):
|
||||
description = 'cdaction.pl - polish games magazine site'
|
||||
category = 'games'
|
||||
language = 'pl'
|
||||
index='http://www.cdaction.pl'
|
||||
oldest_article = 8
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets= True
|
||||
@ -17,4 +18,10 @@ class CD_Action(BasicNewsRecipe):
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup('http://www.cdaction.pl/magazyn/')
|
||||
self.cover_url='http://www.cdaction.pl'+ soup.find(id='wspolnik').div.a['href']
|
||||
return getattr(self, 'cover_url', self.cover_url)
|
||||
return getattr(self, 'cover_url', self.cover_url)
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for a in soup('a'):
|
||||
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
|
||||
a['href']=self.index + a['href']
|
||||
return soup
|
@ -1,11 +1,12 @@
|
||||
from calibre import browser
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
|
||||
title = u'Countryfile.com'
|
||||
cover_url = 'http://www.buysubscriptions.com/static_content/the-immediate/en/images/covers/CFIL_maxi.jpg'
|
||||
#cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
|
||||
__author__ = 'Dave Asbury'
|
||||
description = 'The official website of Countryfile Magazine'
|
||||
# last updated 29/1/12
|
||||
# last updated 15/4/12
|
||||
language = 'en_GB'
|
||||
oldest_article = 30
|
||||
max_articles_per_feed = 25
|
||||
@ -13,7 +14,23 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
|
||||
no_stylesheets = True
|
||||
auto_cleanup = True
|
||||
#articles_are_obfuscated = True
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup('http://www.countryfile.com/')
|
||||
cov = soup.find(attrs={'class' : 'imagecache imagecache-160px_wide imagecache-linked imagecache-160px_wide_linked'})
|
||||
#print '******** ',cov,' ***'
|
||||
cov2 = str(cov)
|
||||
cov2=cov2[124:-90]
|
||||
#print '******** ',cov2,' ***'
|
||||
|
||||
# try to get cover - if can't get known cover
|
||||
br = browser()
|
||||
br.set_handle_redirect(False)
|
||||
try:
|
||||
br.open_novisit(cov2)
|
||||
cover_url = cov2
|
||||
except:
|
||||
cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
|
||||
return cover_url
|
||||
remove_tags = [
|
||||
# dict(attrs={'class' : ['player']}),
|
||||
|
||||
|
@ -1,20 +1,21 @@
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import re
|
||||
import mechanize
|
||||
class AdvancedUserRecipe1306061239(BasicNewsRecipe):
|
||||
title = u'The Daily Mirror'
|
||||
description = 'News as provided by The Daily Mirror - UK'
|
||||
|
||||
__author__ = 'Dave Asbury'
|
||||
# last updated 11/2/12
|
||||
# last updated 7/4/12
|
||||
language = 'en_GB'
|
||||
|
||||
cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
|
||||
#cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
|
||||
|
||||
masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'
|
||||
|
||||
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 5
|
||||
max_articles_per_feed = 10
|
||||
remove_empty_feeds = True
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
@ -75,3 +76,28 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
|
||||
img { display:block}
|
||||
'''
|
||||
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
|
||||
# look for the block containing the mirror button and url
|
||||
cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_92.gif);'})
|
||||
cov2 = str(cov)
|
||||
cov2='http://www.politicshome.com'+cov2[9:-142]
|
||||
#cov2 now contains url of the page containing pic
|
||||
soup = self.index_to_soup(cov2)
|
||||
cov = soup.find(attrs={'id' : 'large'})
|
||||
cov2 = str(cov)
|
||||
cov2=cov2[27:-18]
|
||||
#cov2 now is pic url, now go back to original function
|
||||
br = mechanize.Browser()
|
||||
br.set_handle_redirect(False)
|
||||
try:
|
||||
br.open_novisit(cov2)
|
||||
cover_url = cov2
|
||||
except:
|
||||
cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
|
||||
|
||||
#cover_url = cov2
|
||||
#cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
|
||||
return cover_url
|
||||
|
||||
|
||||
|
@ -11,6 +11,7 @@ class Dobreprogramy_pl(BasicNewsRecipe):
|
||||
cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png'
|
||||
description = u'Aktualności i blogi z dobreprogramy.pl'
|
||||
encoding = 'utf-8'
|
||||
index='http://www.dobreprogramy.pl/'
|
||||
no_stylesheets = True
|
||||
language = 'pl'
|
||||
extra_css = '.title {font-size:22px;}'
|
||||
@ -22,3 +23,10 @@ class Dobreprogramy_pl(BasicNewsRecipe):
|
||||
#remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})]
|
||||
feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'),
|
||||
('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')]
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for a in soup('a'):
|
||||
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
|
||||
a['href']=self.index + a['href']
|
||||
return soup
|
@ -7,6 +7,7 @@ class Dzieje(BasicNewsRecipe):
|
||||
cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png'
|
||||
category = 'history'
|
||||
language = 'pl'
|
||||
index='http://dzieje.pl'
|
||||
oldest_article = 8
|
||||
max_articles_per_feed = 100
|
||||
remove_javascript=True
|
||||
@ -15,3 +16,10 @@ class Dzieje(BasicNewsRecipe):
|
||||
remove_tags_after= dict(id='dogory')
|
||||
remove_tags=[dict(id='dogory')]
|
||||
feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')]
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for a in soup('a'):
|
||||
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
|
||||
a['href']=self.index + a['href']
|
||||
return soup
|
@ -21,3 +21,8 @@ class eioba(BasicNewsRecipe):
|
||||
(u'Rozrywka', u'http://www.eioba.pl/feed/categories/10.xml'),
|
||||
(u'Rożne', u'http://www.eioba.pl/feed/categories/9.xml')
|
||||
]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
return soup
|
||||
|
@ -7,6 +7,7 @@ class eMuzyka(BasicNewsRecipe):
|
||||
description = u'Emuzyka to największa i najpopularniejsza strona o muzyce w Polsce'
|
||||
category = 'music'
|
||||
language = 'pl'
|
||||
index='http://www.emuzyka.pl'
|
||||
cover_url='http://s.emuzyka.pl/img/emuzyka_invert_small.jpg'
|
||||
no_stylesheets = True
|
||||
oldest_article = 7
|
||||
@ -14,3 +15,9 @@ class eMuzyka(BasicNewsRecipe):
|
||||
keep_only_tags=[dict(name='div', attrs={'id':'news_container'}), dict(name='h3'), dict(name='div', attrs={'class':'review_text'})]
|
||||
remove_tags=[dict(name='span', attrs={'id':'date'})]
|
||||
feeds = [(u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'), (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2')]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for a in soup('a'):
|
||||
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
|
||||
a['href']=self.index + a['href']
|
||||
return soup
|
@ -7,7 +7,7 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
|
||||
# cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
|
||||
masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'
|
||||
__author__ = 'Dave Asbury'
|
||||
# last updated 17/3/12
|
||||
# last updated 14/4/12
|
||||
language = 'en_GB'
|
||||
oldest_article = 28
|
||||
max_articles_per_feed = 12
|
||||
@ -28,7 +28,8 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
|
||||
|
||||
#]
|
||||
feeds = [
|
||||
(u'From the Homepage',u'http://feed43.com/8053226782885416.xml'),
|
||||
(u'From the Homepage',u'http://feed43.com/0032328550253453.xml'),
|
||||
#http://feed43.com/8053226782885416.xml'),
|
||||
(u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'),
|
||||
(u'Upgrade',u'http://feed43.com/0877305847443234.xml'),
|
||||
#(u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),
|
||||
|
@ -7,6 +7,7 @@ class Filmweb_pl(BasicNewsRecipe):
|
||||
cover_url = 'http://userlogos.org/files/logos/crudus/filmweb.png'
|
||||
category = 'movies'
|
||||
language = 'pl'
|
||||
index='http://www.filmweb.pl'
|
||||
oldest_article = 8
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets= True
|
||||
@ -39,3 +40,9 @@ class Filmweb_pl(BasicNewsRecipe):
|
||||
self.log.warn(skip_tag)
|
||||
return self.index_to_soup(skip_tag['href'], raw=True)
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for a in soup('a'):
|
||||
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
|
||||
a['href']=self.index + a['href']
|
||||
return soup
|
@ -1,39 +1,49 @@
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Forbes(BasicNewsRecipe):
|
||||
title = u'Forbes'
|
||||
description = 'Business and Financial News'
|
||||
__author__ = 'Darko Miletic'
|
||||
__author__ = 'Kovid Goyal'
|
||||
oldest_article = 30
|
||||
max_articles_per_feed = 100
|
||||
max_articles_per_feed = 20
|
||||
language = 'en'
|
||||
encoding = 'utf-8'
|
||||
recursions = 1
|
||||
|
||||
no_stylesheets = True
|
||||
html2lrf_options = ['--base-font-size', '10']
|
||||
|
||||
cover_url = u'http://www.forbes.com/media/current_covers/forbes_120_160.gif'
|
||||
|
||||
feeds = [(u'Latest', u'http://www.forbes.com/news/index.xml'),
|
||||
(u'Most Popular', u'http://www.forbes.com/feeds/popstories.xml'),
|
||||
(u'Most Emailed', u'http://www.forbes.com/feeds/mostemailed.xml'),
|
||||
(u'Faces', u'http://www.forbes.com/facesscan/index.xml'),
|
||||
(u'Technology', u'http://www.forbes.com/technology/index.xml'),
|
||||
(u'Personal Tech', u'http://www.forbes.com/personaltech/index.xml'),
|
||||
(u'Wireless', u'http://www.forbes.com/wireless/index.xml'),
|
||||
(u'Business', u'http://www.forbes.com/business/index.xml'),
|
||||
(u'Sports Money', u'http://www.forbes.com/sportsmoney/index.xml'),
|
||||
(u'Sports', u'http://www.forbes.com/forbeslife/sports/index.xml'),
|
||||
(u'Vehicles', u'http://www.forbes.com/forbeslife/vehicles/index.xml'),
|
||||
(u'Leadership', u'http://www.forbes.com/leadership/index.xml'),
|
||||
(u'Careers', u'http://www.forbes.com/leadership/careers/index.xml'),
|
||||
(u'Compensation', u'http://www.forbes.com/leadership/compensation/index.xml'),
|
||||
(u'Managing', u'http://www.forbes.com/leadership/managing/index.xml')]
|
||||
|
||||
def print_version(self, url):
|
||||
raw = self.browser.open(url).read()
|
||||
soup = BeautifulSoup(raw.decode('latin1', 'replace'))
|
||||
print_link = soup.find('a', {'onclick':"s_linkTrackVars='prop18';s_linkType='o';s_linkName='Print';if(typeof(globalPageName)!='undefined')s_prop18=globalPageName;s_lnk=s_co(this);s_gs(s_account);"})
|
||||
if print_link is None:
|
||||
return ''
|
||||
return 'http://www.forbes.com' + print_link['href']
|
||||
feeds = [(u'Latest', u'http://www.forbes.com/news/index.xml'),
|
||||
(u'Most Popular', u'http://www.forbes.com/feeds/popstories.xml'),
|
||||
(u'Technology', u'http://www.forbes.com/technology/index.xml'),
|
||||
(u'Business', u'http://www.forbes.com/business/index.xml'),
|
||||
(u'Sports Money', u'http://www.forbes.com/sportsmoney/index.xml'),
|
||||
(u'Leadership', u'http://www.forbes.com/leadership/index.xml'),]
|
||||
|
||||
keep_only_tags = \
|
||||
{'class':lambda x: x and (set(x.split()) & {'body', 'pagination',
|
||||
'articleHead', 'article_head'})}
|
||||
remove_tags_before = {'name':'h1'}
|
||||
remove_tags = [
|
||||
{'class':['comment_bug', 'engagement_block',
|
||||
'video_promo_block', 'article_actions']},
|
||||
{'id':'comments'}
|
||||
]
|
||||
|
||||
def is_link_wanted(self, url, tag):
|
||||
ans = re.match(r'http://.*/[2-9]/', url) is not None
|
||||
if ans:
|
||||
self.log('Following multipage link: %s'%url)
|
||||
return ans
|
||||
|
||||
def postprocess_html(self, soup, first_fetch):
|
||||
for pag in soup.findAll(True, 'pagination'):
|
||||
pag.extract()
|
||||
if not first_fetch:
|
||||
h1 = soup.find('h1')
|
||||
if h1 is not None:
|
||||
h1.extract()
|
||||
return soup
|
||||
|
||||
|
recipes/fotoblogia_pl.recipe (new file, 16 lines)
@@ -0,0 +1,16 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Fotoblogia_pl(BasicNewsRecipe):
|
||||
title = u'Fotoblogia.pl'
|
||||
__author__ = 'fenuks'
|
||||
category = 'photography'
|
||||
language = 'pl'
|
||||
masthead_url = 'http://img.interia.pl/komputery/nimg/u/0/fotoblogia21.jpg'
|
||||
cover_url= 'http://fotoblogia.pl/images/2009/03/fotoblogia2.jpg'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
keep_only_tags=[dict(name='div', attrs={'class':'post-view post-standard'})]
|
||||
remove_tags=[dict(attrs={'class':['external fotoblogia', 'categories', 'tags']})]
|
||||
feeds = [(u'Wszystko', u'http://fotoblogia.pl/feed/rss2')]
|
@ -6,16 +6,24 @@ class Gameplay_pl(BasicNewsRecipe):
|
||||
description = u'gameplay.pl - serwis o naszych zainteresowaniach, grach, filmach, książkach, muzyce, fotografii i konsolach.'
|
||||
category = 'games, movies, books, music'
|
||||
language = 'pl'
|
||||
index='http://gameplay.pl'
|
||||
masthead_url= 'http://gameplay.pl/img/gpy_top_logo.png'
|
||||
cover_url= 'http://gameplay.pl/img/gpy_top_logo.png'
|
||||
max_articles_per_feed = 100
|
||||
remove_javascript= True
|
||||
no_stylesheets= True
|
||||
keep_only_tags=[dict(name='div', attrs={'class':['news_endpage_tit', 'news']})]
|
||||
remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im']})]
|
||||
remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im', 'news_list', 'news_list_autor', 'stop_bot', 'tagi']}), dict(attrs={'usemap':'#map'})]
|
||||
feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')]
|
||||
|
||||
def image_url_processor(self, baseurl, url):
|
||||
if 'http' not in url:
|
||||
return 'http://gameplay.pl'+ url[2:]
|
||||
else:
|
||||
return url
|
||||
return url
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for a in soup('a'):
|
||||
if a.has_key('href') and '../' in a['href']:
|
||||
a['href']=self.index + a['href'][2:]
|
||||
return soup
|
@ -9,6 +9,7 @@ class Gildia(BasicNewsRecipe):
|
||||
language = 'pl'
|
||||
oldest_article = 8
|
||||
max_articles_per_feed = 100
|
||||
remove_empty_feeds=True
|
||||
no_stylesheets=True
|
||||
remove_tags=[dict(name='div', attrs={'class':'backlink'}), dict(name='div', attrs={'class':'im_img'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})]
|
||||
keep_only_tags=dict(name='div', attrs={'class':'widetext'})
|
||||
@ -24,3 +25,16 @@ class Gildia(BasicNewsRecipe):
|
||||
self.log.warn('odnosnik')
|
||||
self.log.warn(link['href'])
|
||||
return self.index_to_soup(link['href'], raw=True)
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for a in soup('a'):
|
||||
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
|
||||
if '/gry/' in a['href']:
|
||||
a['href']='http://www.gry.gildia.pl' + a['href']
|
||||
elif u'książk' in soup.title.string.lower() or u'komiks' in soup.title.string.lower():
|
||||
a['href']='http://www.literatura.gildia.pl' + a['href']
|
||||
elif u'komiks' in soup.title.string.lower():
|
||||
a['href']='http://www.literatura.gildia.pl' + a['href']
|
||||
else:
|
||||
a['href']='http://www.gildia.pl' + a['href']
|
||||
return soup
|
||||
|
@ -7,6 +7,7 @@ class Gram_pl(BasicNewsRecipe):
|
||||
category = 'games'
|
||||
language = 'pl'
|
||||
oldest_article = 8
|
||||
index='http://www.gram.pl'
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets= True
|
||||
extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
|
||||
@ -52,4 +53,7 @@ class Gram_pl(BasicNewsRecipe):
|
||||
tag=soup.findAll(name='div', attrs={'class':'picbox'})
|
||||
for t in tag:
|
||||
t['style']='float: left;'
|
||||
for a in soup('a'):
|
||||
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
|
||||
a['href']=self.index + a['href']
|
||||
return soup
|
@ -59,6 +59,7 @@ class heiseDe(BasicNewsRecipe):
|
||||
dict(name='span', attrs={'class':'rsaquo'}),
|
||||
dict(name='div', attrs={'class':'news_logo'}),
|
||||
dict(name='div', attrs={'class':'bcadv ISI_IGNORE'}),
|
||||
dict(name='div', attrs={'class':'navi_top_container'}),
|
||||
dict(name='p', attrs={'class':'news_option'}),
|
||||
dict(name='p', attrs={'class':'news_navi'}),
|
||||
dict(name='div', attrs={'class':'news_foren'})]
|
||||
@ -69,3 +70,5 @@ class heiseDe(BasicNewsRecipe):
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
recipes/historia_news.recipe (new file, 20 lines)
@@ -0,0 +1,20 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class historia_news(BasicNewsRecipe):
|
||||
title = u'historia-news'
|
||||
__author__ = 'fenuks'
|
||||
description = u'Historia-news to portal dla ludzi kochających historię. Najnowsze newsy z historii bliższej i dalszej, archeologii, paleontologii oraz ciekawostki i podcasty z historii kultury, sportu, motoryzacji i inne.'
|
||||
masthead_url = 'http://historia-news.pl/templates/hajak4/images/header.jpg'
|
||||
cover_url= 'http://www.historia-news.pl/templates/hajak4/images/header.jpg'
|
||||
category = 'history'
|
||||
language = 'pl'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
remove_empty_feeds = True
|
||||
remove_tags=[dict(name='form'), dict(name='img', attrs={'alt':'Print'}), dict(attrs={'class':['commbutt', 'cpr']}), dict(id=['plusone', 'facebook'])]
|
||||
feeds = [(u'Wiadomo\u015bci', u'http://historia-news.pl/wiadomoci.feed?type=rss'), (u'Artyku\u0142y', u'http://historia-news.pl/artykuy.feed?type=rss')]
|
||||
|
||||
|
||||
def print_version(self, url):
|
||||
return url + '?tmpl=component&print=1&layout=default&page='
|
New binary files (icons for the new recipes; contents not shown):
    recipes/icons/fotoblogia_pl.png   (379 B)
    recipes/icons/historia_news.png   (833 B)
    recipes/icons/swiat_obrazu.png    (1006 B)
    recipes/icons/telam.png           (1.9 KiB)
@ -8,6 +8,7 @@ class in4(BasicNewsRecipe):
|
||||
description = u'Serwis Informacyjny - Aktualnosci, recenzje'
|
||||
category = 'IT'
|
||||
language = 'pl'
|
||||
index='http://www.in4.pl/'
|
||||
#cover_url= 'http://www.in4.pl/recenzje/337/in4pl.jpg'
|
||||
no_stylesheets = True
|
||||
remove_empty_feeds = True
|
||||
@ -39,6 +40,7 @@ class in4(BasicNewsRecipe):
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
self.append_page(soup, soup.body)
|
||||
for a in soup('a'):
|
||||
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
|
||||
a['href']=self.index + a['href']
|
||||
return soup
|
||||
|
||||
|
||||
|
@ -8,6 +8,7 @@ class INFRA(BasicNewsRecipe):
|
||||
description = u'Serwis Informacyjny INFRA - UFO, Zjawiska Paranormalne, Duchy, Tajemnice świata.'
|
||||
cover_url = 'http://npn.nazwa.pl/templates/ja_teline_ii/images/logo.jpg'
|
||||
category = 'UFO'
|
||||
index='http://infra.org.pl'
|
||||
language = 'pl'
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
@ -15,3 +16,11 @@ class INFRA(BasicNewsRecipe):
|
||||
remove_tags_after=dict(attrs={'class':'pagenav'})
|
||||
remove_tags=[dict(attrs={'class':'pagenav'})]
|
||||
feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/index.php?option=com_rd_rss&id=1')]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
for a in soup('a'):
|
||||
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
|
||||
a['href']=self.index + a['href']
|
||||
return soup
|
recipes/jakarta_globe.recipe (new file, 34 lines)
@@ -0,0 +1,34 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class JakartaGlobe(BasicNewsRecipe):
|
||||
title = u'Jakarta Globe'
|
||||
oldest_article = 3
|
||||
max_articles_per_feed = 100
|
||||
|
||||
feeds = [
|
||||
(u'News', u'http://www.thejakartaglobe.com/pages/getrss/getrss-news.php'),
|
||||
(u'Business', u'http://www.thejakartaglobe.com/pages/getrss/getrss-business.php'),
|
||||
(u'Technology', u'http://www.thejakartaglobe.com/pages/getrss/getrss-tech.php'),
|
||||
(u'My Jakarta', u'http://www.thejakartaglobe.com/pages/getrss/getrss-myjakarta.php'),
|
||||
(u'International', u'http://www.thejakartaglobe.com/pages/getrss/getrss-international.php'),
|
||||
(u'Life and Times', u'http://www.thejakartaglobe.com/pages/getrss/getrss-lifeandtimes.php'),
|
||||
]
|
||||
__author__ = 'rty'
|
||||
publisher = 'JakartaGlobe.com'
|
||||
description = 'JakartaGlobe, Indonesia, Newspaper'
|
||||
category = 'News, Indonesia'
|
||||
|
||||
|
||||
remove_javascript = True
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
language = 'en_ID'
|
||||
encoding = 'utf-8'
|
||||
conversion_options = {'linearize_tables':True}
|
||||
masthead_url = 'http://www.thejakartaglobe.com/pages/2010/images/jak-globe-logo.jpg'
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class':'story'}),
|
||||
dict(name='span', attrs={'class':'headline'}),
|
||||
dict(name='div', attrs={'class':'story'}),
|
||||
dict(name='p', attrs={'id':'bodytext'})
|
||||
]
|
@ -1,5 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
|
||||
class Konflikty(BasicNewsRecipe):
|
||||
title = u'Konflikty Zbrojne'
|
||||
@ -10,6 +11,23 @@ class Konflikty(BasicNewsRecipe):
|
||||
category='military, history'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
auto_cleanup = True
|
||||
no_stylesheets = True
|
||||
keep_only_tags=[dict(attrs={'class':['title1', 'image']}), dict(id='body')]
|
||||
|
||||
feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'), (u'Artyku\u0142y', u'http://www.konflikty.pl/rss_artykuly_10.xml'), (u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'), (u'Recenzje', u'http://www.konflikty.pl/rss_recenzje_10.xml')]
|
||||
feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'),
|
||||
(u'Artyku\u0142y', u'http://www.konflikty.pl/rss_artykuly_10.xml'),
|
||||
(u'Historia', u'http://www.konflikty.pl/rss_historia_10.xml'),
|
||||
(u'Militaria', u'http://www.konflikty.pl/rss_militaria_10.xml'),
|
||||
(u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'),
|
||||
(u'Recenzje', u'http://www.konflikty.pl/rss_recenzje_10.xml'),
|
||||
(u'Teksty źródłowe', u'http://www.konflikty.pl/rss_tekstyzrodlowe_10.xml')]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
for image in soup.findAll(name='a', attrs={'class':'image'}):
|
||||
if image.img and image.img.has_key('alt'):
|
||||
image.name='div'
|
||||
pos = len(image.contents)
|
||||
image.insert(pos, BeautifulSoup('<p style="font-style:italic;">'+image.img['alt']+'</p>'))
|
||||
return soup
|
||||
|
recipes/liberatorio_politico.recipe (new file, 12 lines)
@@ -0,0 +1,12 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1334649829(BasicNewsRecipe):
|
||||
title = u'Liberatorio Politico'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
auto_cleanup = True
|
||||
masthead_url = 'http://liberatorio.altervista.org/wp-content/uploads/2012/01/Testata-LIBERATORIO-Altervista1.jpg'
|
||||
feeds = [(u'Liberatorio Politico', u'http://liberatorio.altervista.org/feed/')]
|
||||
__author__ = 'faber1971'
|
||||
description = 'Inquiry journalism - a blog on Molfetta, Land of Bari, Apulia and Italy - v1.00 (07, April 2012)'
|
||||
language = 'it'
|
recipes/limes.recipe (new file, 50 lines)
@@ -0,0 +1,50 @@
|
||||
#!/usr/bin/env python
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, faber1971'
|
||||
__version__ = 'v1.00'
|
||||
__date__ = '16, April 2012'
|
||||
__description__ = 'Geopolitical Italian magazine'
|
||||
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Limes(BasicNewsRecipe):
|
||||
description = 'Italian weekly magazine'
|
||||
__author__ = 'faber1971'
|
||||
|
||||
cover_url = 'http://temi.repubblica.it/UserFiles/limes/Image/Loghi/logo-limes.gif'
|
||||
title = 'Limes'
|
||||
category = 'Geopolitical news'
|
||||
|
||||
language = 'it'
|
||||
# encoding = 'cp1252'
|
||||
timefmt = '[%a, %d %b, %Y]'
|
||||
|
||||
oldest_article = 16
|
||||
max_articles_per_feed = 100
|
||||
use_embedded_content = False
|
||||
recursion = 10
|
||||
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
masthead_url = 'http://temi.repubblica.it/UserFiles/limes/Image/Loghi/logo-limes.gif'
|
||||
|
||||
feeds = [
|
||||
(u'Limes', u'http://temi.repubblica.it/limes/feed/')
|
||||
]
|
||||
|
||||
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class':['testo','copertina','occhiello','firma','didascalia','content-second-right','detail-articles','titolo-local','generic-articles']}),
|
||||
dict(name='div', attrs={'class':['generic-articles','summary','detail-articles']}),
|
||||
dict(name='div', attrs={'id':['content-second-right','content2']})
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div',attrs={'class':['servizi','aggiungi','label-web','bottom-mobile','box-abbonamenti','box-cerca','big','little','stampaweb']}),
|
||||
dict(name='div',attrs={'id':['topheader','header','navigation-new','navigation','content-second-left','menutext']}),
|
||||
dict(name='ul',attrs={'id':'user-utility'}),
|
||||
dict(name=['script','noscript','iframe'])
|
||||
]
|
||||
|
@ -1,11 +1,13 @@
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'faber1971'
|
||||
description = 'Collection of Italian marketing websites - v1.04 (17, March 2012)'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1327062445(BasicNewsRecipe):
|
||||
title = u'Marketing Magazine'
|
||||
description = 'Collection of Italian marketing websites'
|
||||
language = 'it'
|
||||
__author__ = 'faber1971'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
auto_cleanup = True
|
||||
@ -16,4 +18,4 @@ class AdvancedUserRecipe1327062445(BasicNewsRecipe):
|
||||
dict(name='ul', attrs={'id':'ads0'})
|
||||
]
|
||||
masthead_url = 'http://www.simrendeogun.com/wp-content/uploads/2011/06/New-Marketing-Magazine-Logo.jpg'
|
||||
feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'MarketingArena', u'http://feeds.feedburner.com/marketingarena'), (u'minimarketing', u'http://feeds.feedburner.com/minimarketingit'), (u'Marketing Journal', u'http://feeds.feedburner.com/marketingjournal/jPwA'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')]
|
||||
feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'MarketingArena', u'http://feeds.feedburner.com/marketingarena'), (u'Marketing Journal', u'http://feeds.feedburner.com/marketingjournal/jPwA'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'minimarketing', u'http://feeds.feedburner.com/minimarketingit'), (u'[4]marketing.biz', u'http://feeds.feedburner.com/4marketing'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Bloguerrilla', u'http://feeds.feedburner.com/Bloguerrilla'), (u'Nonconvenzionale', u'http://feeds.feedburner.com/nonconvenzionale'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')]
|
||||
|
@ -3,25 +3,6 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import re
|
||||
from calibre.utils.magick import Image
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
try:
|
||||
from calibre_plugins.drMerry.debug import debuglogger as mlog
|
||||
print 'drMerry debuglogger found, debug options can be used'
|
||||
from calibre_plugins.drMerry.stats import statslogger as mstat
|
||||
print 'drMerry stats tracker found, stat can be tracked'
|
||||
mlog.setLoglevel(1) #-1 == no log; 0 for normal output
|
||||
mstat.calculateStats(False) #track stats (to track stats loglevel must be > 0
|
||||
KEEPSTATS = mstat.keepmystats()
|
||||
SHOWDEBUG0 = mlog.showdebuglevel(0)
|
||||
SHOWDEBUG1 = mlog.showdebuglevel(1)
|
||||
SHOWDEBUG2 = mlog.showdebuglevel(2)
|
||||
except:
|
||||
#print 'drMerry debuglogger not found, skipping debug options'
|
||||
SHOWDEBUG0 = False
|
||||
SHOWDEBUG1 = False
|
||||
SHOWDEBUG2 = False
|
||||
KEEPSTATS = False
|
||||
|
||||
#print ('level0: %s\nlevel1: %s\nlevel2: %s' % (SHOWDEBUG0,SHOWDEBUG1,SHOWDEBUG2))
|
||||
|
||||
''' Version 1.2, updated cover image to match the changed website.
|
||||
added info date on title
|
||||
@ -43,80 +24,75 @@ except:
|
||||
extended timeout from 2 to 10
|
||||
changed oldest article from 10 to 1.2
|
||||
changed max articles from 15 to 25
|
||||
Version 1.9.1 18-04-2012
|
||||
removed some debug settings
|
||||
updated code to match new metro-layout
|
||||
Version 1.9.2 24-04-2012
|
||||
updated code to match new metro-layout
|
||||
Version 1.9.3 25-04-2012
|
||||
Changed a lot of custom code into calibre code as the default code of calibre has become much faster since the first version fo this recipe
|
||||
Added new feeds
|
||||
Updated css
|
||||
Changed order of regex to speedup proces
|
||||
'''
|
||||
|
||||
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
|
||||
title = u'Metro Nieuws NL'
|
||||
oldest_article = 1.2
|
||||
max_articles_per_feed = 25
|
||||
__author__ = u'DrMerry'
|
||||
description = u'Metro Nederland'
|
||||
language = u'nl'
|
||||
simultaneous_downloads = 3
|
||||
__author__ = u'DrMerry'
|
||||
description = u'Metro Nederland'
|
||||
language = u'nl'
|
||||
simultaneous_downloads = 5
|
||||
masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
|
||||
timeout = 10
|
||||
center_navbar = True
|
||||
timefmt = ' [%A, %d %b %Y]'
|
||||
center_navbar = True
|
||||
timefmt = ' [%A, %d %b %Y]'
|
||||
no_stylesheets = True
|
||||
remove_javascript = True
|
||||
remove_empty_feeds = True
|
||||
cover_url = 'http://www.oldreadmetro.com/img/en/metroholland/last/1/small.jpg'
|
||||
cover_url = 'http://www.oldreadmetro.com/img/en/metroholland/last/1/small.jpg'
|
||||
publication_type = 'newspaper'
|
||||
encoding = 'utf-8'
|
||||
remove_attributes = ['style', 'font', 'width', 'height']
|
||||
encoding = 'utf-8'
|
||||
remove_attributes = ['style', 'font', 'width', 'height', 'itemtype', 'itemprop', 'itemscope']#, 'href']
|
||||
use_embedded_content = False
|
||||
conversion_options = {
|
||||
'authors' : 'Metro Nederland & calibre & DrMerry',
|
||||
'author_sort' : 'Metro Nederland & calibre & DrMerry',
|
||||
'publisher' : 'DrMerry/Metro Nederland'
|
||||
}
|
||||
extra_css = 'body {padding:5px 0px; background:#fff;font-size: 13px;}\
|
||||
#date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {clear: both;margin-bottom: 10px;font-size:0.5em; color: #616262;}\
|
||||
.article-box-fact.module-title {clear:both;padding: 8px 0;color: #24763b;font-family: arial, sans-serif;font-size: 14px;font-weight: bold;}\
|
||||
h1.title {color: #000000;font-size: 44px;padding-bottom: 10px;font-weight: 300;} h2.subtitle {font-size: 13px;font-weight: 700;padding-bottom: 10px;}\
|
||||
.article-body p{padding-bottom:10px;}div.column-1-3{margin-left: 19px;padding-right: 9px;}\
|
||||
div.column-1-2 {display: inline;padding-right: 7px;}\
|
||||
p.article-image-caption {font-size: 12px;font-weight: 300;color: #616262;margin-top: 5px;} \
|
||||
p.article-image-caption .credits {font-style: italic;font-size: 10px;}\
|
||||
div.article-image-caption {width: 246px;margin-bottom: 5px;margin-left: 10px;}\
|
||||
div.article-image-caption-2column {margin-bottom: 10px;width: 373px;} div.article-image-caption-3column {}\
|
||||
img {border:0px; padding:2px;} hr.merryhr {width:30%; border-width:0px; color:green; margin-left:5px; background-color: green} div.column-3 {background-color:#eee; width:50%; margin:2px; float:right; padding:2px;} div.column-3 module-title {border: 1px solid #aaa} div.article-box-fact div.subtitle {font-weight:bold; color:green;}'
|
||||
extra_css = 'body{font-size:1em;padding:5px 0}body,a,h2{background-color:#fff;text-decoration:none;color:#000}#date,div.byline,p.article-image-caption .credits,.calibrenavbar{font-size:.5em}.article-box-fact.module-title,#date,div.byline{clear:both}.article-box-fact.module-title{margin:8px 0}.article-box-fact.module-title,h2{font-size:1.1em}h1.title{font-size:1.4em}h1.title,.article-body p,div.article-image-caption-2column,div.article-image-caption-3column,#date,div.byline{margin-bottom:.6em}div.article-box-fact div.subtitle,.article-box-fact.module-title,h1.title,p.article-image-caption{font-weight:700}div.column-1-3{margin-left:19px}div.column-1-2{display:inline}div.column-1-2,div.column-1-3{margin-right:7px}p.article-image-caption{font-size:.6em;margin-top:5px}p.article-image-caption,#date,div.byline{color:#616262}p.article-image-caption .credits{font-style:italic}div.article-image-caption{width:246px}div.article-image-caption-2column{width:373px}div.column-3{background-color:#eee;float:right;width:50%}div.column-3 module-title{border:1px solid #aaa}div.article-box-fact div.subtitle,.article-box-fact.module-title{color:#24763b}div.byline{border-top:2px solid #24763b}div.column-3,img,div.column-3,p.small,div.article-image-caption{margin:.5em}img,p.small,.column1,h2{border:0;padding:0}.column1,h1,h2{margin:0}'
|
||||
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<img[^>]+top-line[^>]+>', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: '<hr class="merryhr" />'),
|
||||
(re.compile(r'(<img[^>]+metronieuws\.nl/[^>]+/templates/[^>]+jpe?g[^>]+>|metronieuws\.nl/internal\-roxen\-unit\.gif)', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: ''),
|
||||
(re.compile(r'( |\s|<img[^>]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)', re.DOTALL|re.IGNORECASE),lambda match: ' '),
|
||||
#(re.compile(r'( |\s)+', re.DOTALL|re.IGNORECASE),lambda match:' '),
|
||||
#(re.compile(r'<(a |/a)[^>]*>', re.DOTALL|re.IGNORECASE),lambda match:'')
|
||||
#(re.compile('(</?)h2', re.DOTALL|re.IGNORECASE),lambda match:'\1em')
|
||||
]
|
||||
|
||||
remove_tags_before= dict(id='date')
|
||||
remove_tags_after = [dict(name='div', attrs={'class':['column-1-3','gallery-text']})]#id='share-and-byline')]
|
||||
|
||||
remove_tags = [
|
||||
dict(name=['iframe','script','noscript','style']),
|
||||
dict(name='div', attrs={'class':[re.compile('column-[14]-5'),'col-179 ','col-373 ','clear','ad','navigation',re.compile('share-tools(-top)?'),'tools','metroCommentFormWrap','article-tools-below-title','related-links','padding-top-15',re.compile('^promo.*?$'),'teaser-component',re.compile('fb(-comments|_iframe_widget)')]}),
|
||||
dict(id=['column-1-5-bottom','column-4-5',re.compile('^ad(\d+|adcomp.*?)?$'),'sidebar',re.compile('^article-\d'),'comments','gallery-1']),
|
||||
dict(name='a', attrs={'name':'comments'}),
|
||||
#dict(name='div', attrs={'data-href'}),
|
||||
dict(name='img', attrs={'class':'top-line'}),
|
||||
dict(attrs={'style':re.compile('^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$'),'title':'volledig scherm'})]
|
||||
|
||||
'''removed by before/after:
|
||||
id:
|
||||
column-1-5-top,'hidden_div','footer',
|
||||
class:
|
||||
'header',re.compile('^footer-[a-zA-Z0-9]+$),'header-links',
|
||||
'''
|
||||
def preprocess_html(self, soup):
|
||||
if SHOWDEBUG0 == True:
|
||||
mlog.setdefaults()
|
||||
mlog.addTextAndTag(['Show debug = on with level'], [str(mlog.debuglevel)])
|
||||
if KEEPSTATS == True:
|
||||
mlog.addDebug('Stats will be calculated')
|
||||
else:
|
||||
mlog.addTextAndTag(['Stats won\'t be calculated\nTo be enabled, stats must be true, currently','and debug level must be 1 or higher, currently'],[mstat.dokeepmystats, mlog.debuglevel])
|
||||
mlog.showDebug()
|
||||
myProcess = MerryProcess()
|
||||
myProcess.moveTitleAndAuthor(soup)
|
||||
myProcess.removeUnwantedTags(soup)
|
||||
return soup
|
||||
|
||||
def postprocess_html(self, soup, first):
|
||||
myProcess = MerryProcess()
|
||||
myProcess.optimizeLayout(soup)
|
||||
if SHOWDEBUG0 == True:
|
||||
if KEEPSTATS == True:
|
||||
statinfo = 'generated stats:'
|
||||
statinfo += str(mstat.stats(mstat.statslist))
|
||||
print statinfo
|
||||
statinfo = 'generated stats (for removed tags):'
|
||||
statinfo += str(mstat.stats(mstat.removedtagslist))
|
||||
print statinfo
|
||||
#show all Debug info we forgot to report
|
||||
#Using print to be sure that this text will not be added at the end of the log.
|
||||
print '\n!!!!!unreported messages:\n(should be empty)\n'
|
||||
mlog.showDebug()
|
||||
return soup
|
||||
|
||||
feeds = [
|
||||
@ -128,295 +104,109 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
|
||||
(u'Buitenland', u'http://www.metronieuws.nl/rss.xml?c=1277377288-4'),
|
||||
(u'Columns', u'http://www.metronieuws.nl/rss.xml?c=1277377288-17'),
|
||||
(u'Entertainment', u'http://www.metronieuws.nl/rss.xml?c=1277377288-2'),
|
||||
(u'Dot', u'http://www.metronieuws.nl/rss.xml?c=1283166782-12'),
|
||||
(u'Strips',u'http://www.metronieuws.nl/rss.xml?c=1325037714-0'),
|
||||
(u'Tech', u'http://www.metronieuws.nl/rss.xml?c=1283166782-12'),
|
||||
(u'Familie', u'http://www.metronieuws.nl/rss.xml?c=1283166782-9'),
|
||||
(u'Blogs', u'http://www.metronieuws.nl/rss.xml?c=1295586825-6'),
|
||||
(u'Reizen', u'http://www.metronieuws.nl/rss.xml?c=1277377288-13'),
|
||||
(u'Carrière', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
|
||||
(u'Carrière', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
|
||||
(u'Wetenschap',u'http://www.metronieuws.nl/rss.xml?c=1303088437-0'),
|
||||
(u'Planeet',u'http://www.metronieuws.nl/rss.xml?c=1277377288-14'),
|
||||
(u'Gezondheid',u'http://www.metronieuws.nl/rss.xml?c=1277377288-15'),
|
||||
(u'Sport', u'http://www.metronieuws.nl/rss.xml?c=1277377288-12')
|
||||
]
|
||||
|
||||
class MerryPreProcess():
|
||||
def replacePictures(self, soup):
|
||||
#to be implemented
|
||||
return soup
|
||||
|
||||
def optimizePicture(self,soup):
|
||||
if SHOWDEBUG0 == True:
|
||||
mlog.addDebug('start image optimize')
|
||||
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
|
||||
iurl = tag['src']
|
||||
img = Image()
|
||||
img.open(iurl)
|
||||
img.trim(0)
|
||||
img.save(iurl)
|
||||
if SHOWDEBUG0 == True:
|
||||
mlog.addDebug('Images optimized')
|
||||
mlog.showDebug()
|
||||
try:
|
||||
iurl = tag['src']
|
||||
img = Image()
|
||||
img.open(iurl)
|
||||
img.trim(0)
|
||||
img.save(iurl)
|
||||
except:
|
||||
print '\n!!image optimize failed!!\n'
|
||||
continue
|
||||
return soup
|
||||
|
||||
class MerryExtract():
|
||||
def safeRemovePart(self, killingSoup, soupIsArray):
|
||||
if killingSoup and not killingSoup == None:
|
||||
if SHOWDEBUG2 == True:
|
||||
mlog.addTextAndTag(['items to remove'],[killingSoup])
|
||||
try:
|
||||
if soupIsArray == True:
|
||||
for killer in killingSoup:
|
||||
killer.extract()
|
||||
else:
|
||||
killingSoup.extract()
|
||||
if SHOWDEBUG1 == True:
|
||||
mlog.addDebug('tag extracted')
|
||||
mlog.showDebug()
|
||||
if KEEPSTATS == True:
|
||||
try:
|
||||
mstat.addstat(mstat.removedtagslist,str(killingSoup.name))
|
||||
except:
|
||||
mstat.addstat(mstat.removedtagslist,'unknown')
|
||||
except:
|
||||
if SHOWDEBUG1 == True:
|
||||
mlog.addDebug('tag extraction failed')
|
||||
mlog.showDebug()
|
||||
if KEEPSTATS == True:
|
||||
mstat.addstat(mstat.removedtagslist,'exception')
|
||||
return False
|
||||
else:
|
||||
return False
|
||||
return killingSoup
|
||||
|
||||
class MerryReplace():
|
||||
myKiller = MerryExtract()
|
||||
def replaceATag(self, soup):
|
||||
anchors = []
|
||||
anchors = soup.findAll('a')
|
||||
if anchors and not (anchors == None or anchors == []):
|
||||
try:
|
||||
for link in anchors:
|
||||
# print str(link)
|
||||
if link and not link == None:
|
||||
# print ('type: %s'%(str(type(link))))
|
||||
# print ('link: %s' % (link))
|
||||
myParent = link.parent
|
||||
# print str('parent: %s'%(myParent))
|
||||
try:
|
||||
myIndex = link.parent.index(link)
|
||||
hasIndex = True
|
||||
except:
|
||||
myIndex = 0
|
||||
hasIndex = False
|
||||
# print str('index %s'%(myIndex))
|
||||
if not link.string == None:
|
||||
# print 'link=notnone'
|
||||
if hasIndex == True:
|
||||
myParent.insert(myIndex, link.string)
|
||||
else:
|
||||
myParent.append(link.string)
|
||||
else:
|
||||
# print 'link=none'
|
||||
myParent.insert(myIndex, link.contents)
|
||||
self.myKiller.safeRemovePart(link, False)
|
||||
else:
|
||||
notshown = 'tag received is empty' # print
|
||||
except:
|
||||
notshown = 'tag received is empty' # print
|
||||
notshown
|
||||
return soup
|
||||
|
||||
class MerryProcess(BeautifulSoup):
|
||||
myKiller = MerryExtract()
|
||||
myReplacer = MerryReplace()
|
||||
myPrepare = MerryPreProcess()
|
||||
|
||||
def optimizeLayout(self,soup):
|
||||
self.myPrepare.optimizePicture(soup)
|
||||
if SHOWDEBUG0 == True:
|
||||
mlog.addDebug('End of Optimize Layout')
|
||||
mlog.showDebug()
|
||||
return soup
|
||||
|
||||
def insertFacts(self, soup):
|
||||
allfacts = soup.findAll('div', {'class':re.compile('^article-box-fact.*$')})
|
||||
if SHOWDEBUG0 == True:
|
||||
mlog.addTextAndTag(['allfacts'],[allfacts])
|
||||
mlog.showDebug()
|
||||
thefactpart = re.compile('^article-box-fact.*$')
|
||||
allfacts = soup.findAll('div', {'class':thefactpart})
|
||||
if allfacts and not allfacts == None:
|
||||
allfactsparent = soup.find('div', {'class':re.compile('^article-box-fact.*$')}).parent
|
||||
if SHOWDEBUG0 == True:
|
||||
mlog.addTextAndTag(['allfactsparent'],[allfactsparent])
|
||||
mlog.showDebug()
|
||||
allfactsparent = soup.find('div', {'class':thefactpart}).parent
|
||||
for part in allfactsparent:
|
||||
if not part in allfacts:
|
||||
if SHOWDEBUG0 == True:
|
||||
mlog.addTextAndTag(['FOUND A non-fact'],[part])
|
||||
mlog.showDebug()
|
||||
self.myKiller.safeRemovePart(part, True)
|
||||
if SHOWDEBUG1 == True:
|
||||
mlog.addTextAndTag(['New All Facts'],[allfacts])
|
||||
mlog.showDebug()
|
||||
articlefacts = soup.find('div', {'class':'article-box-fact column'})
|
||||
errorOccured=False
|
||||
if (articlefacts and not articlefacts==None):
|
||||
try:
|
||||
contenttag = soup.find('div', {'class':'article-body'})
|
||||
if SHOWDEBUG0 == True:
|
||||
mlog.addTextAndTag(['curcontag'],[contenttag])
|
||||
mlog.showDebug()
|
||||
foundrighttag = False
|
||||
if contenttag and not contenttag == None:
|
||||
foundrighttag = True
|
||||
if SHOWDEBUG0 == True:
|
||||
if errorOccured == False:
|
||||
mlog.addTextAndTag(['type','curcontag (in while)'],[type(contenttag),contenttag])
|
||||
else:
|
||||
mlog.addDebug('Could not find right parent tag. Error Occured')
|
||||
mlog.showDebug()
|
||||
if foundrighttag == True:
|
||||
contenttag.insert(0, allfactsparent)
|
||||
if SHOWDEBUG2 == True:
|
||||
mlog.addTextAndTag(['added parent'],[soup.prettify()])
|
||||
mlog.showDebug()
|
||||
except:
|
||||
errorOccured=True
|
||||
mlog.addTrace()
|
||||
else:
|
||||
errorOccured=True
|
||||
if SHOWDEBUG0 == True and errorOccured == True:
|
||||
mlog.addTextAndTag(['no articlefacts'],[articlefacts])
|
||||
mlog.showDebug()
|
||||
pass
|
||||
return soup
|
||||
|
||||
def moveTitleAndAuthor(self, soup):
|
||||
moveitem = soup.h1
|
||||
pubdate = soup.find(id="date")
|
||||
if moveitem and not moveitem == None and pubdate and not pubdate == None:
|
||||
try:
|
||||
pubdate.parent.insert(0, moveitem)
|
||||
except:
|
||||
print '\n!!error in moving title!!\n'
|
||||
pass
|
||||
moveitem = None
|
||||
moveitem = soup.find('div', {'class':'byline'})
|
||||
if moveitem and not moveitem == None:
|
||||
try:
|
||||
moveitem.parent.parent.insert(-1, moveitem)
|
||||
except:
|
||||
print '\n!!error in moving byline!!\n'
|
||||
pass
|
||||
return soup
|
||||
|
||||
def previousNextSibRemover(self, soup, previous=True, soupIsArray=False):
|
||||
findsibsof = soup
|
||||
firstpart = previous
|
||||
if findsibsof and not findsibsof == None:
|
||||
if soupIsArray == True:
|
||||
for foundsib in findsibsof:
|
||||
self.previousNextSibRemover(foundsib, firstpart, soupIsArray=False)
|
||||
else:
|
||||
if firstpart == True and soupIsArray == False:
|
||||
sibs = findsibsof.previousSiblingGenerator()
|
||||
else:
|
||||
sibs = findsibsof.nextSiblingGenerator()
|
||||
for sib in sibs:
|
||||
self.myKiller.safeRemovePart(sib, True)
|
||||
else:
|
||||
if SHOWDEBUG1 == True:
|
||||
mlog.addDebug('Not any sib found')
|
||||
return
|
||||
|
||||
def removeUnwantedTags(self,soup):
|
||||
if SHOWDEBUG1 == True:
|
||||
mlog.addTextAndTag(['Len of Soup before RemoveTagsByName'],[len(str(soup))])
|
||||
mlog.showDebug()
|
||||
self.removeTagsByName(soup)
|
||||
if SHOWDEBUG1 == True:
|
||||
mlog.addDebug('Len of Soup before firstandlastpart: %s' % len(str(soup)))
|
||||
mlog.showDebug()
|
||||
self.insertFacts(soup)
|
||||
self.removeFirstAndLastPart(soup)
|
||||
if SHOWDEBUG1 == True:
|
||||
mlog.addDebug('Len of Soup before unwantedpart: %s' % len(str(soup)))
|
||||
mlog.showDebug()
|
||||
self.removeUnwantedParts(soup)
|
||||
if SHOWDEBUG1 == True:
|
||||
mlog.addDebug('Len of Soup before EmptyParts: %s' % len(str(soup)))
|
||||
mlog.showDebug()
|
||||
self.removeEmptyTags(soup)
|
||||
if SHOWDEBUG1 == True:
|
||||
mlog.addDebug('Len of Soup after EmptyParts: %s' % len(str(soup)))
|
||||
mlog.showDebug()
|
||||
self.myReplacer.replaceATag(soup)
|
||||
return soup
|
||||
|
||||
def removeUnwantedParts(self, soup):
|
||||
if SHOWDEBUG1 == True:
|
||||
mlog.addDebug('Len of Soup before UnwantedID: %s' % len(str(soup)))
|
||||
mlog.showDebug()
|
||||
self.removeUnwantedTagsByID(soup)
|
||||
if SHOWDEBUG1 == True:
|
||||
mlog.addDebug('Len of Soup before Class: %s' % len(str(soup)))
|
||||
mlog.showDebug()
|
||||
self.removeUnwantedTagsByClass(soup)
|
||||
if SHOWDEBUG1 == True:
|
||||
mlog.addDebug('Len of Soup before Style: %s' % len(str(soup)))
|
||||
mlog.showDebug()
|
||||
self.removeUnwantedTagsByStyle(soup)
|
||||
return soup
|
||||
|
||||
def removeUnwantedTagsByStyle(self,soup):
|
||||
self.removeArrayOfTags(soup.findAll(attrs={'style' : re.compile("^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$")}))
|
||||
if SHOWDEBUG0 == True:
|
||||
mlog.addDebug('end remove by style')
|
||||
self.removeArrayOfTags(soup.findAll(attrs={'class': 'share-tools-bottom'})) # at end to keep author
|
||||
return soup
|
||||
|
||||
def removeArrayOfTags(self,souparray):
|
||||
return self.myKiller.safeRemovePart(souparray, True)
|
||||
|
||||
def removeUnwantedTagsByClass(self,soup):
|
||||
if SHOWDEBUG0 == True:
|
||||
mlog.addDebug('start remove by class')
|
||||
self.removeArrayOfTags(soup.findAll("div", { "class" :re.compile('^(promo.*?|article-tools-below-title|metroCommentFormWrap|ad|share-tools|tools|header-links|related-links|padding-top-15)$')}))
|
||||
return soup
|
||||
|
||||
def removeUnwantedTagsByID(self,soup):
|
||||
defaultids = ['footer-extra',re.compile('^ad(\d+|adcomp.*?)?$'),'column-4-5','navigation','header',re.compile('^column-1-5-(top|bottom)$'),'footer','hidden_div','sidebar',re.compile('^article-\d$'),'comments','footer']
|
||||
for removeid in defaultids:
|
||||
if SHOWDEBUG1 == True:
|
||||
mlog.addDebug('RemoveTagByID, tag: %s, Len of Soup: %s' % (str(removeid), len(str(soup))))
|
||||
mlog.showDebug()
|
||||
self.removeArrayOfTags(soup.findAll(id=removeid))
|
||||
return soup
|
||||
|
||||
# def safeRemoveTag(self, subtree):
|
||||
# return self.myKiller.safeRemovePart(subtree, True)
|
||||
|
||||
|
||||
def removeTagsByName(self, soup):
|
||||
self.myKiller.safeRemovePart(soup.script, True)
|
||||
self.myKiller.safeRemovePart(soup.iframe, True)
|
||||
self.myKiller.safeRemovePart(soup.style, True)
|
||||
self.myKiller.safeRemovePart(soup.noscript, True)
|
||||
return soup
|
||||
|
||||
def removeEmptyTags(self,soup,run=0):
|
||||
if SHOWDEBUG0 == True:
|
||||
mlog.addDebug('starting removeEmptyTags')
|
||||
if SHOWDEBUG1 == True:
|
||||
run += 1
|
||||
mlog.addDebug(run)
|
||||
if SHOWDEBUG2 == True:
|
||||
mlog.addDebug(str(soup.prettify()))
|
||||
mlog.showDebug()
|
||||
emptymatches = re.compile('^( |\s|\n|\r|\t)*$')
|
||||
emptymatches = re.compile('^[ \s\n\r\t ]*$')
|
||||
emptytags = soup.findAll(lambda tag: tag.find(True) is None and (tag.string is None or tag.string.strip()=="" or tag.string.strip()==emptymatches) and not tag.isSelfClosing)
|
||||
if emptytags and not (emptytags == None or emptytags == []):
|
||||
if SHOWDEBUG1 == True:
|
||||
mlog.addDebug('tags found')
|
||||
mlog.addDebug(str(emptytags))
|
||||
self.removeArrayOfTags(emptytags)
|
||||
#recursive in case removing empty tag creates new empty tag
|
||||
self.removeEmptyTags(soup, run=run)
|
||||
else:
|
||||
if SHOWDEBUG1 == True:
|
||||
mlog.addDebug('no empty tags found')
|
||||
mlog.showDebug()
|
||||
if SHOWDEBUG0 == True:
|
||||
if SHOWDEBUG2 == True:
|
||||
mlog.addDebug('new soup:')
|
||||
mlog.addDebug(str(soup.prettify()))
|
||||
mlog.addDebug('RemoveEmptyTags Completed')
|
||||
mlog.showDebug()
|
||||
return soup
|
||||
|
||||
def removeFirstAndLastPart(self,soup):
|
||||
def findparenttag(lookuptag):
|
||||
if lookuptag and not lookuptag == None:
|
||||
return lookuptag.findParents()
|
||||
findtag = soup.find(id="date")
|
||||
self.previousNextSibRemover(findtag, previous=True, soupIsArray=False)
|
||||
self.previousNextSibRemover(findparenttag(findtag), previous=True, soupIsArray=True)
|
||||
for endtag in [soup.find(id="share-and-byline"), soup.find("div", { "class" : "gallery-text" })]:
|
||||
self.previousNextSibRemover(endtag, previous=False, soupIsArray=False)
|
||||
self.previousNextSibRemover(findparenttag(endtag), previous=False, soupIsArray=True)
|
||||
return soup
|
||||
return soup
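# Illustrative sketch, not part of the recipe above: the Merry* helpers wrap
# BeautifulSoup's Tag.extract(), which detaches a node from the parse tree.
# Assuming the BeautifulSoup build bundled with calibre, a minimal stand-alone
# example (markup and variable names are made up) would be:
from calibre.ebooks.BeautifulSoup import BeautifulSoup
sketch = BeautifulSoup('<div><a href="#">x</a><p>keep</p></div>')
link = sketch.find('a')
link.extract()     # removes the <a> tag and returns it, as safeRemovePart() does
print sketch       # -> <div><p>keep</p></div>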
|
@ -1,52 +1,30 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
|
||||
title = u'Metro UK'
|
||||
description = 'News as provide by The Metro -UK'
|
||||
|
||||
#timefmt = ''
|
||||
__author__ = 'Dave Asbury'
|
||||
#last update 3/12/11
|
||||
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'
|
||||
no_stylesheets = True
|
||||
#no_stylesheets = True
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 20
|
||||
max_articles_per_feed = 10
|
||||
remove_empty_feeds = True
|
||||
remove_javascript = True
|
||||
auto_cleanup = True
|
||||
|
||||
#preprocess_regexps = [(re.compile(r'Tweet'), lambda a : '')]
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<span class="img-cap legend">', re.IGNORECASE | re.DOTALL), lambda match: '<p></p><span class="img-cap legend"> ')]
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'tweet', re.IGNORECASE | re.DOTALL), lambda match: '')]
|
||||
|
||||
language = 'en_GB'
|
||||
|
||||
|
||||
masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif'
|
||||
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='h1'),dict(name='h2', attrs={'class':'h2'}),
|
||||
dict(attrs={'class':['img-cnt figure']}),
|
||||
dict(attrs={'class':['art-img']}),
|
||||
dict(name='div', attrs={'class':'art-lft'}),
|
||||
dict(name='p')
|
||||
|
||||
]
|
||||
remove_tags = [
|
||||
dict(name = 'div',attrs={'id' : ['comments-news','formSubmission']}),
|
||||
dict(name='div', attrs={'class':[ 'news m12 clrd clr-b p5t shareBtm', 'commentForm', 'metroCommentInnerWrap',
|
||||
'art-rgt','pluck-app pluck-comm','news m12 clrd clr-l p5t', 'flt-r','username','clrd' ]}),
|
||||
dict(attrs={'class':['username', 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime','addYourComment','displayName']})
|
||||
,dict(name='div', attrs={'class' : 'clrd art-fd fd-gr1-b'})
|
||||
|
||||
]
|
||||
|
||||
|
||||
feeds = [
|
||||
(u'News', u'http://www.metro.co.uk/rss/news/'), (u'Money', u'http://www.metro.co.uk/rss/money/'), (u'Sport', u'http://www.metro.co.uk/rss/sport/'), (u'Film', u'http://www.metro.co.uk/rss/metrolife/film/'), (u'Music', u'http://www.metro.co.uk/rss/metrolife/music/'), (u'TV', u'http://www.metro.co.uk/rss/tv/'), (u'Showbiz', u'http://www.metro.co.uk/rss/showbiz/'), (u'Weird News', u'http://www.metro.co.uk/rss/weird/'), (u'Travel', u'http://www.metro.co.uk/rss/travel/'), (u'Lifestyle', u'http://www.metro.co.uk/rss/lifestyle/'), (u'Books', u'http://www.metro.co.uk/rss/lifestyle/books/'), (u'Food', u'http://www.metro.co.uk/rss/lifestyle/restaurants/')]
|
||||
|
||||
extra_css = '''
|
||||
body {font: sans-serif medium;}'
|
||||
h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
|
||||
h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
|
||||
span{ font-size:9.5px; font-weight:bold;font-style:italic}
|
||||
p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
|
||||
|
||||
'''
|
||||
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
|
||||
'''
|
||||
|
@ -9,8 +9,9 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
class recipeMagic(BasicNewsRecipe):
|
||||
title = 'National Geographic PL'
|
||||
__author__ = 'Marcin Urban 2011'
|
||||
__modified_by__ = 'fenuks'
|
||||
description = 'legenda wśród magazynów z historią sięgającą 120 lat'
|
||||
cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg'
|
||||
#cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
@ -42,11 +43,43 @@ class recipeMagic(BasicNewsRecipe):
|
||||
]
|
||||
|
||||
remove_attributes = ['width','height']
|
||||
feeds=[]
|
||||
|
||||
feeds = [
|
||||
('National Geographic PL', 'http://www.national-geographic.pl/rss/'),
|
||||
]
|
||||
def find_articles(self, url):
|
||||
articles = []
|
||||
soup=self.index_to_soup(url)
|
||||
tag=soup.find(attrs={'class':'arl'})
|
||||
art=tag.ul.findAll('li')
|
||||
for i in art:
|
||||
title=i.a['title']
|
||||
url=i.a['href']
|
||||
#date=soup.find(id='footer').ul.li.string[41:-1]
|
||||
desc=i.div.p.string
|
||||
articles.append({'title' : title,
|
||||
'url' : url,
|
||||
'date' : '',
|
||||
'description' : desc
|
||||
})
|
||||
return articles
|
||||
|
||||
def parse_index(self):
|
||||
feeds = []
|
||||
feeds.append((u"Aktualności", self.find_articles('http://www.national-geographic.pl/aktualnosci/')))
|
||||
feeds.append((u"Artykuły", self.find_articles('http://www.national-geographic.pl/artykuly/')))
|
||||
|
||||
return feeds
|
||||
|
||||
def print_version(self, url):
|
||||
return url.replace('artykuly0Cpokaz', 'drukuj-artykul')
|
||||
if 'artykuly' in url:
|
||||
return url.replace('artykuly/pokaz', 'drukuj-artykul')
|
||||
elif 'aktualnosci' in url:
|
||||
return url.replace('aktualnosci/pokaz', 'drukuj-artykul')
|
||||
else:
|
||||
return url
|
||||
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup('http://www.national-geographic.pl/biezace-wydania/')
|
||||
tag=soup.find(attrs={'class':'txt jus'})
|
||||
self.cover_url=tag.img['src']
|
||||
return getattr(self, 'cover_url', self.cover_url)
|
||||
|
||||
|
16
recipes/non_leggerlo.recipe
Normal file
@ -0,0 +1,16 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1335362999(BasicNewsRecipe):
|
||||
title = u'Non leggerlo'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
auto_cleanup = False
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class':'post hentry'})
|
||||
]
|
||||
feeds = [(u'Non leggerlo', u'http://nonleggerlo.blogspot.com/feeds/posts/default')]
|
||||
description = 'An Italian satirical blog'
|
||||
language = 'it'
|
||||
__author__ = 'faber1971'
|
||||
__version__ = 'v1.0'
|
||||
__date__ = '24, April 2012'
|
@ -81,5 +81,7 @@ class Nowa_Fantastyka(BasicNewsRecipe):
|
||||
title=soup.find(attrs={'class':'tytul'})
|
||||
if title:
|
||||
title['style']='font-size: 20px; font-weight: bold;'
|
||||
self.log.warn(soup)
|
||||
for a in soup('a'):
|
||||
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
|
||||
a['href']=self.INDEX + a['href']
|
||||
return soup
|
||||
|
@ -1,3 +1,4 @@
|
||||
import urllib, re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1279258912(BasicNewsRecipe):
|
||||
@ -27,12 +28,30 @@ class AdvancedUserRecipe1279258912(BasicNewsRecipe):
|
||||
encoding = 'utf-8'
|
||||
conversion_options = {'linearize_tables':True}
|
||||
masthead_url = 'http://www.orlandosentinel.com/media/graphic/2009-07/46844851.gif'
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class':'story'})
|
||||
]
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class':['articlerail','tools','comment-group','clearfix']}),
|
||||
]
|
||||
remove_tags_after = [
|
||||
dict(name='p', attrs={'class':'copyright'}),
|
||||
]
|
||||
|
||||
auto_cleanup = True
|
||||
|
||||
def get_article_url(self, article):
|
||||
ans = None
|
||||
try:
|
||||
s = article.summary
|
||||
ans = urllib.unquote(
|
||||
re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
|
||||
except:
|
||||
pass
|
||||
if ans is None:
|
||||
link = article.get('feedburner_origlink', None)
|
||||
if link and link.split('/')[-1]=="story01.htm":
|
||||
link=link.split('/')[-2]
|
||||
encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
|
||||
'0D': '?', '0E': '-', '0N': '.com', '0L': 'http:',
|
||||
'0S':'//'}
|
||||
for k, v in encoding.iteritems():
|
||||
link = link.replace(k, v)
|
||||
ans = link
|
||||
elif link:
|
||||
ans = link
|
||||
if ans is not None:
|
||||
return ans.replace('?track=rss', '')
|
||||
|
||||
|
||||
|
@ -7,6 +7,7 @@ class PC_Arena(BasicNewsRecipe):
|
||||
description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.'
|
||||
category = 'IT'
|
||||
language = 'pl'
|
||||
index='http://pcarena.pl'
|
||||
masthead_url='http://pcarena.pl/pcarena/img/logo.png'
|
||||
cover_url= 'http://pcarena.pl/pcarena/img/logo.png'
|
||||
no_stylesheets = True
|
||||
@ -22,4 +23,10 @@ class PC_Arena(BasicNewsRecipe):
|
||||
if 'http' not in url:
|
||||
return 'http://pcarena.pl' + url
|
||||
else:
|
||||
return url
|
||||
return url
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for a in soup('a'):
|
||||
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
|
||||
a['href']=self.index + a['href']
|
||||
return soup
|
@ -1,5 +1,5 @@
|
||||
"""
|
||||
readitlaterlist.com
|
||||
Pocket Calibre Recipe v1.0
|
||||
"""
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '''
|
||||
@ -12,22 +12,23 @@ from calibre import strftime
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
|
||||
class Readitlater(BasicNewsRecipe):
|
||||
title = 'ReadItLater'
|
||||
class Pocket(BasicNewsRecipe):
|
||||
title = 'Pocket'
|
||||
__author__ = 'Darko Miletic, Przemyslaw Kryger, Keith Callenberg, tBunnyMan'
|
||||
description = '''Personalized news feeds. Go to readitlaterlist.com to setup \
|
||||
up your news. This version displays pages of articles from \
|
||||
description = '''Personalized news feeds. Go to getpocket.com to setup up \
|
||||
your news. This version displays pages of articles from \
|
||||
oldest to newest, with max & minimum counts, and marks articles \
|
||||
read after downloading.'''
|
||||
publisher = 'readitlaterlist.com'
|
||||
publisher = 'getpocket.com'
|
||||
category = 'news, custom'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 50
|
||||
minimum_articles = 1
|
||||
minimum_articles = 10
|
||||
mark_as_read_after_dl = True
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
needs_subscription = True
|
||||
INDEX = u'http://readitlaterlist.com'
|
||||
INDEX = u'http://getpocket.com'
|
||||
LOGIN = INDEX + u'/l'
|
||||
readList = []
|
||||
|
||||
@ -100,9 +101,31 @@ class Readitlater(BasicNewsRecipe):
|
||||
br = self.get_browser()
|
||||
for link in markList:
|
||||
url = self.INDEX + link
|
||||
print 'Marking read: ', url
|
||||
response = br.open(url)
|
||||
response
|
||||
print response.info()
|
||||
|
||||
def cleanup(self):
|
||||
self.mark_as_read(self.readList)
|
||||
if self.mark_as_read_after_dl:
|
||||
self.mark_as_read(self.readList)
|
||||
else:
|
||||
pass
|
||||
|
||||
def default_cover(self, cover_file):
|
||||
'''
|
||||
Create a generic cover for recipes that don't have a cover
|
||||
This override adds time to the cover
|
||||
'''
|
||||
try:
|
||||
from calibre.ebooks import calibre_cover
|
||||
title = self.title if isinstance(self.title, unicode) else \
|
||||
self.title.decode('utf-8', 'replace')
|
||||
date = strftime(self.timefmt)
|
||||
time = strftime('[%I:%M %p]')
|
||||
img_data = calibre_cover(title, date, time)
|
||||
cover_file.write(img_data)
|
||||
cover_file.flush()
|
||||
except:
|
||||
self.log.exception('Failed to generate default cover')
|
||||
return False
|
||||
return True
|
||||
|
59
recipes/richmond_times_dispatch.recipe
Normal file
@ -0,0 +1,59 @@
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1335532466(BasicNewsRecipe):
|
||||
title = u'Richmond Times-Dispatch'
|
||||
description = 'News from Richmond, Virginia, USA'
|
||||
__author__ = 'jde'
|
||||
cover_url = 'http://static2.dukecms.com/va_tn/timesdispatch_com/site-media/img/icons/logo252x97.png'
|
||||
language = 'en'
|
||||
encoding = 'utf8'
|
||||
oldest_article = 1 #days
|
||||
max_articles_per_feed = 25
|
||||
needs_subscription = False
|
||||
remove_javascript = True
|
||||
recursions = 0
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
auto_cleanup = True
|
||||
|
||||
feeds = [
|
||||
|
||||
('News',
|
||||
'http://www2.timesdispatch.com/list/feed/rss/news-archive'),
|
||||
('Breaking News',
|
||||
'http://www2.timesdispatch.com/list/feed/rss/breaking-news'),
|
||||
('National News',
|
||||
'http://www2.timesdispatch.com/list/feed/rss/national-news'),
|
||||
('Local News',
|
||||
'http://www2.timesdispatch.com/list/feed/rss/local-news'),
|
||||
('Business',
|
||||
'http://www2.timesdispatch.com/list/feed/rss/business'),
|
||||
('Local Business',
|
||||
'http://www2.timesdispatch.com/list/feed/rss/local-business'),
|
||||
('Politics',
|
||||
'http://www2.timesdispatch.com/list/feed/rss/politics'),
|
||||
('Virginia Politics',
|
||||
'http://www2.timesdispatch.com/list/feed/rss/virginia-politics'),
|
||||
('Editorials',
|
||||
'http://www2.timesdispatch.com/list/feed/rss/editorial-desk'),
|
||||
('Columnists and Blogs',
|
||||
'http://www2.timesdispatch.com/list/feed/rss/news-columnists-blogs'),
|
||||
('Opinion Columnists',
|
||||
'http://www2.timesdispatch.com/list/feed/rss/opinion-editorial-columnists'),
|
||||
('Letters to the Editor',
|
||||
'http://www2.timesdispatch.com/list/feed/rss/opinion-letters'),
|
||||
('Traffic',
|
||||
'http://www2.timesdispatch.com/list/feed/rss/traffic'),
|
||||
('Sports',
|
||||
'http://www2.timesdispatch.com/list/feed/rss/sports2'),
|
||||
('Entertainment/Life',
|
||||
'http://www2.timesdispatch.com/list/feed/rss/entertainment'),
|
||||
('Movies',
|
||||
'http://www2.timesdispatch.com/list/feed/rss/movies'),
|
||||
('Music',
|
||||
'http://www2.timesdispatch.com/list/feed/rss/music'),
|
||||
('Dining & Food',
|
||||
'http://www2.timesdispatch.com/list/feed/rss/dining'),
|
||||
|
||||
]
|
||||
|
141
recipes/sol_haber.recipe
Normal file
@ -0,0 +1,141 @@
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Onur Gungor onurgu@gmail.com'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
www.sol.org.tr
|
||||
'''
|
||||
|
||||
import datetime
|
||||
|
||||
import re
|
||||
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class SolHaberRecipe(BasicNewsRecipe):
|
||||
title = u'soL Haber'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
|
||||
language = 'tr'
|
||||
__author__ = 'Onur Güngör'
|
||||
description = 'Hayata soL''dan bakın..'
|
||||
publisher = 'soL Haber'
|
||||
tags = 'news, haberler, siyaset, türkiye, turkey, politics'
|
||||
|
||||
|
||||
conversion_options = {
|
||||
'comment' : description
|
||||
, 'tags' : tags
|
||||
, 'publisher' : publisher
|
||||
, 'language' : language
|
||||
}
|
||||
|
||||
category_dict = { 'sonuncu-kavga':'Sonuncu Kavga',
|
||||
'devlet-ve-siyaset':'Devlet ve Siyaset',
|
||||
'ekonomi':'Ekonomi',
|
||||
'enternasyonal-gundem':'Enternasyonel Gündem',
|
||||
'kent-gundemleri':'Kent Gündemleri',
|
||||
'kultur-sanat':'Kültür Sanat',
|
||||
'dunyadan':'Dünyadan',
|
||||
'serbest-kursu':'Serbest Kürsü',
|
||||
'medya':'Medya',
|
||||
'liseliler':'Liseliler',
|
||||
'yazarlar':'Köşe Yazıları'}
|
||||
|
||||
end_date = datetime.date.today().isoformat()
|
||||
start_date = (datetime.date.today()-datetime.timedelta(days=1)).isoformat()
|
||||
|
||||
|
||||
section_tuples = [['Köşe Yazıları', 'http://haber.sol.org.tr/arsiv?icerik=kose_yazisi&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
|
||||
['Haberler', 'http://haber.sol.org.tr/arsiv?icerik=haber&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
|
||||
['soL postal', 'http://haber.sol.org.tr/arsiv?icerik=postal&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
|
||||
['Bizim Amerika', 'http://haber.sol.org.tr/arsiv?icerik=bizim_amerika&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)]]
|
||||
|
||||
|
||||
# Disable stylesheets from site.
|
||||
no_stylesheets = True
|
||||
|
||||
cover_margins = (20, 20, '#ffffff')
|
||||
|
||||
storybody_reg_exp = '^\s*(haber|kose)\s*$'
|
||||
|
||||
comments_reg_exp = '^\s*makale-elestiri\s*$'
|
||||
|
||||
remove_tags = [dict(name='div', attrs={'class':re.compile(comments_reg_exp, re.IGNORECASE)})]
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)})]
|
||||
|
||||
def get_masthead_title(self):
|
||||
return self.title + "(" + self.end_date + ")"
|
||||
|
||||
def parse_index(self):
|
||||
|
||||
result = []
|
||||
articles_dict = dict()
|
||||
|
||||
author_regexp = re.compile('^http://.*?/yazarlar/(.*?)/.*$')
|
||||
category_regexp = re.compile('^http://.*?/(.+?)/.*$')
|
||||
|
||||
for section_tuple in self.section_tuples:
|
||||
|
||||
section_title = section_tuple[0]
|
||||
section_index_url = section_tuple[1]
|
||||
|
||||
self.log('Bölüm:', section_title, 'URL:', section_index_url)
|
||||
|
||||
soup = self.index_to_soup(section_index_url)
|
||||
|
||||
logo = soup.find('div', id='logo').find('img', src=True)
|
||||
if logo is not None:
|
||||
self.cover_url = logo['src']
|
||||
if self.cover_url.startswith('/'):
|
||||
self.cover_url = 'http://haber.sol.org.tr'+self.cover_url
|
||||
|
||||
view_content = soup.find('div', id='ana-icerik').find('div', attrs={'class':'view-content'})
|
||||
if view_content == None:
|
||||
break
|
||||
rows = view_content.find('tbody').findAll('tr')
|
||||
|
||||
self.log('Row sayısı', len(rows))
|
||||
for row in rows:
|
||||
cells = row.findAll('td')
|
||||
|
||||
a = cells[1].find('a', href=True)
|
||||
|
||||
url = a['href']
|
||||
title = self.tag_to_string(a)
|
||||
|
||||
if url.startswith('/'):
|
||||
url = 'http://haber.sol.org.tr'+url
|
||||
|
||||
category = section_title
|
||||
category_match_result = category_regexp.match(url)
|
||||
if category_match_result:
|
||||
category = category_match_result.group(1)
|
||||
|
||||
date = self.tag_to_string(cells[2])
|
||||
|
||||
author = 'soL haber'
|
||||
|
||||
author_match_result = author_regexp.match(url)
|
||||
if author_match_result:
|
||||
author = author_match_result.group(1)
|
||||
|
||||
self.log('\tFound article:', title, 'at', url, 'published at ', date, 'by', author)
|
||||
article = {'title':title, 'url':url, 'description':None, 'date':date, 'author':author}
|
||||
if category in articles_dict:
|
||||
articles_dict[category].append(article)
|
||||
else:
|
||||
articles_dict[category] = [article]
|
||||
|
||||
for category in articles_dict.keys():
|
||||
if category in self.category_dict:
|
||||
result.append((self.category_dict[category], articles_dict[category]))
|
||||
else:
|
||||
result.append((category, articles_dict[category]))
|
||||
|
||||
return result
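# Illustrative sketch, not part of the recipe above: parse_index() is expected
# to return a list of (section title, article list) pairs, each article being a
# dict like the one built in the loop. All values below are made-up examples:
sample_index = [
    (u'Haberler', [
        {'title': u'Example headline',
         'url': 'http://haber.sol.org.tr/ornek-haber',   # hypothetical URL
         'description': None,
         'date': u'27 Nisan 2012',
         'author': u'soL haber'},
    ]),
]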
|
25
recipes/swiat_obrazu.recipe
Normal file
@ -0,0 +1,25 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Swiat_Obrazu(BasicNewsRecipe):
|
||||
title = u'Swiat Obrazu'
|
||||
__author__ = 'fenuks'
|
||||
description = u'Internetowy Dziennik o Fotografii i Wideo www.SwiatObrazu.pl to źródło informacji o technice fotografii i wideo, o sprzęcie najbardziej znanych i uznanych firm: Canon, Nikon, Sony, Hasselblad i wielu innych. Znajdziecie tu programy do obróbki zdjęć, forum foto i forum wideo i galerie zdjęć. Codziennie najświeższe informacje: aktualności, testy, poradniki, wywiady, felietony. Swiatobrazu.pl stale organizuje konkursy oraz warsztaty fotograficzne i wideo.'
|
||||
category = 'photography'
|
||||
masthead_url = 'http://www.swiatobrazu.pl/img/logo.jpg'
|
||||
cover_url = 'http://www.swiatobrazu.pl/img/logo.jpg'
|
||||
language = 'pl'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
remove_javascript= True
|
||||
use_embedded_content = False
|
||||
feeds = [(u'Wszystko', u'http://www.swiatobrazu.pl/rss')]
|
||||
|
||||
def print_version(self, url):
|
||||
return url + ',drukuj'
|
||||
|
||||
def image_url_processor(self, baseurl, url):
|
||||
if 'http://' not in url or 'https://' not in url:
|
||||
return 'http://www.swiatobrazu.pl' + url[5:]
|
||||
else:
|
||||
return url
|
@ -34,7 +34,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
|
||||
no_javascript = True
|
||||
remove_empty_feeds = True
|
||||
encoding = 'utf-8'
|
||||
remove_tags = [{'class':'hcf-header'}, {'class':'hcf-atlas'}, {'class':'hcf-date hcf-separate'}]
|
||||
remove_tags = [{'class':'hcf-header'}, {'class':'hcf-atlas'}, {'class':'hcf-colon'}, {'class':'hcf-date hcf-separate'}]
|
||||
|
||||
def print_version(self, url):
|
||||
url = url.split('/')
|
||||
@ -51,6 +51,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
|
||||
return ''.join(div.findAll(text=True, recursive=False)).strip() if div is not None else None
|
||||
|
||||
articles = {}
|
||||
links = set()
|
||||
key = None
|
||||
ans = []
|
||||
maincol = soup.find('div', attrs={'class':re.compile('hcf-main-col')})
|
||||
@ -59,7 +60,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
|
||||
|
||||
if div['class'] == 'hcf-header':
|
||||
try:
|
||||
key = string.capwords(feed_title(div.em.a))
|
||||
key = string.capwords(feed_title(div.em))
|
||||
articles[key] = []
|
||||
ans.append(key)
|
||||
except:
|
||||
@ -70,6 +71,12 @@ class TagesspiegelRSS(BasicNewsRecipe):
|
||||
if not a:
|
||||
continue
|
||||
url = 'http://www.tagesspiegel.de' + a['href']
|
||||
|
||||
# check for duplicates
|
||||
if url in links:
|
||||
continue
|
||||
links.add(url)
|
||||
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
description = ''
|
||||
pubdate = strftime('%a, %d %b')
|
||||
|
@ -34,4 +34,12 @@ class tanuki(BasicNewsRecipe):
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
self.append_page(soup, soup.body)
|
||||
for a in soup('a'):
|
||||
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
|
||||
if 'tanuki-anime' in soup.title.string.lower():
|
||||
a['href']='http://anime.tanuki.pl' + a['href']
|
||||
elif 'tanuki-manga' in soup.title.string.lower():
|
||||
a['href']='http://manga.tanuki.pl' + a['href']
|
||||
elif 'tanuki-czytelnia' in soup.title.string.lower():
|
||||
a['href']='http://czytelnia.tanuki.pl' + a['href']
|
||||
return soup
|
62
recipes/telam.recipe
Normal file
@ -0,0 +1,62 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
www.telam.com.ar
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Telam(BasicNewsRecipe):
|
||||
title = 'Telam'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'AGENCIA DE NOTICIAS DE LA REPUBLICA ARGENTINA'
|
||||
publisher = 'Telam S.E.'
|
||||
category = 'news, politics, Argentina'
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 200
|
||||
no_stylesheets = True
|
||||
encoding = 'utf8'
|
||||
use_embedded_content = False
|
||||
language = 'es_AR'
|
||||
remove_empty_feeds = True
|
||||
publication_type = 'newsportal'
|
||||
masthead_url = 'http://www.telam.com.ar/front/imagenes/encabezado/logotelam.jpg'
|
||||
extra_css = """
|
||||
body{font-family: Arial,Helvetica,sans-serif }
|
||||
img{margin-bottom: 0.4em; display:block}
|
||||
"""
|
||||
|
||||
conversion_options = {
|
||||
'comment' : description
|
||||
, 'tags' : category
|
||||
, 'publisher' : publisher
|
||||
, 'language' : language
|
||||
}
|
||||
|
||||
remove_tags = [dict(name=['meta','link'])]
|
||||
remove_tags_before = dict(attrs={'class':'nota_fecha'})
|
||||
remove_tags_after = dict(attrs={'class':'nota_completa'})
|
||||
remove_attributes = ['lang']
|
||||
|
||||
|
||||
feeds = [
|
||||
(u'Ultimas noticias', u'http://www.telam.com.ar/xml/rss/' )
|
||||
,(u'Politica' , u'http://www.telam.com.ar/xml/rss/1')
|
||||
,(u'Economia' , u'http://www.telam.com.ar/xml/rss/2')
|
||||
,(u'Sociedad' , u'http://www.telam.com.ar/xml/rss/3')
|
||||
,(u'Policiales' , u'http://www.telam.com.ar/xml/rss/4')
|
||||
,(u'Internacionales' , u'http://www.telam.com.ar/xml/rss/6')
|
||||
,(u'Espectaculos' , u'http://www.telam.com.ar/xml/rss/7')
|
||||
,(u'Cultura' , u'http://www.telam.com.ar/xml/rss/8')
|
||||
,(u'Deportes' , u'http://www.telam.com.ar/xml/rss/9')
|
||||
,(u'Telam Investiga' , u'http://www.telam.com.ar/xml/rss/5')
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
artid = url.rpartition('/')[2]
|
||||
return 'http://www.telam.com.ar/?codProg=imprimir-nota&id=' + artid
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
return soup
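# Illustrative only, not part of the recipe above: print_version() keeps the
# text after the last '/' (the article id) and appends it to the print URL.
# The article URL below is hypothetical:
sample_url = 'http://www.telam.com.ar/nota/12345'
artid = sample_url.rpartition('/')[2]    # -> '12345'
print 'http://www.telam.com.ar/?codProg=imprimir-nota&id=' + artid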
|
@ -1,9 +1,8 @@
|
||||
import re
|
||||
import re, mechanize
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
|
||||
|
||||
title = u'The Sun UK'
|
||||
cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
|
||||
|
||||
description = 'A Recipe for The Sun tabloid UK'
|
||||
__author__ = 'Dave Asbury'
|
||||
@ -24,37 +23,69 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
|
||||
no_stylesheets = True
|
||||
|
||||
extra_css = '''
|
||||
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
|
||||
'''
|
||||
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
|
||||
'''
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<div class="foot-copyright".*?</div>', re.IGNORECASE | re.DOTALL), lambda match: '')]
|
||||
(re.compile(r'<div class="foot-copyright".*?</div>', re.IGNORECASE | re.DOTALL), lambda match: '')]
|
||||
|
||||
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='h1'),dict(name='h2',attrs={'class' : 'medium centered'}),
|
||||
dict(name='div',attrs={'class' : 'text-center'}),
|
||||
dict(name='div',attrs={'id' : 'bodyText'})
|
||||
# dict(name='p')
|
||||
]
|
||||
dict(name='div',attrs={'class' : 'text-center'}),
|
||||
dict(name='div',attrs={'id' : 'bodyText'})
|
||||
# dict(name='p')
|
||||
]
|
||||
remove_tags=[
|
||||
#dict(name='head'),
|
||||
dict(attrs={'class' : ['mystery-meat-link','ltbx-container','ltbx-var ltbx-hbxpn','ltbx-var ltbx-nav-loop','ltbx-var ltbx-url']}),
|
||||
#dict(name='head'),
|
||||
dict(attrs={'class' : ['mystery-meat-link','ltbx-container','ltbx-var ltbx-hbxpn','ltbx-var ltbx-nav-loop','ltbx-var ltbx-url']}),
|
||||
dict(name='div',attrs={'class' : 'cf'}),
|
||||
dict(attrs={'title' : 'download flash'}),
|
||||
dict(attrs={'title' : 'download flash'}),
|
||||
dict(attrs={'style' : 'padding: 5px'})
|
||||
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
feeds = [
|
||||
#(u'News', u'http://www.thesun.co.uk/sol/homepage/news/rss'),
|
||||
(u'News','http://feed43.com/2517447382644748.xml'),
|
||||
(u'Sport', u'http://feed43.com/4283846255668687.xml'),
|
||||
(u'Bizarre', u'http://feed43.com/0233840304242011.xml'),
|
||||
(u'Film',u'http://feed43.com/1307545221226200.xml'),
|
||||
(u'Music',u'http://feed43.com/1701513435064132.xml'),
|
||||
(u'Sun Woman',u'http://feed43.com/0022626854226453.xml'),
|
||||
]
|
||||
(u'News','http://feed43.com/2517447382644748.xml'),
|
||||
(u'Sport', u'http://feed43.com/4283846255668687.xml'),
|
||||
(u'Bizarre', u'http://feed43.com/0233840304242011.xml'),
|
||||
(u'Film',u'http://feed43.com/1307545221226200.xml'),
|
||||
(u'Music',u'http://feed43.com/1701513435064132.xml'),
|
||||
(u'Sun Woman',u'http://feed43.com/0022626854226453.xml'),
|
||||
]
|
||||
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
|
||||
# look for the block containing the sun button and url
|
||||
cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'})
|
||||
|
||||
|
||||
|
||||
#cov = soup.find(attrs={'id' : 'large'})
|
||||
cov2 = str(cov)
|
||||
|
||||
cov2='http://www.politicshome.com'+cov2[9:-133]
|
||||
#cov2 now contains url of the page containing pic
|
||||
|
||||
#cov2 now contains url of the page containing pic
|
||||
soup = self.index_to_soup(cov2)
|
||||
cov = soup.find(attrs={'id' : 'large'})
|
||||
cov2 = str(cov)
|
||||
cov2=cov2[27:-18]
|
||||
#cov2 now is pic url, now go back to original function
|
||||
|
||||
br = mechanize.Browser()
|
||||
br.set_handle_redirect(False)
|
||||
try:
|
||||
br.open_novisit(cov2)
|
||||
cover_url = cov2
|
||||
except:
|
||||
cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
|
||||
|
||||
#cover_url = cov2
|
||||
#cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
|
||||
return cover_url
|
||||
|
||||
|
||||
|
@ -11,6 +11,8 @@ class TPM_uk(BasicNewsRecipe):
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'Title says it all'
|
||||
publisher = "The Philosophers' Magazine"
|
||||
recipe_disabled = ('This recipe has been disabled as the website has'
|
||||
' started providing articles only in PDF form')
|
||||
category = 'philosophy, news'
|
||||
oldest_article = 25
|
||||
max_articles_per_feed = 200
|
||||
|
@ -2,65 +2,50 @@
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import with_statement
|
||||
|
||||
''' Changelog
|
||||
2012-04-27 DrMerry:
|
||||
Added cover picture
|
||||
removed some extra tags
|
||||
'''
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Tweakers(BasicNewsRecipe):
|
||||
title = u'Tweakers.net - with Reactions'
|
||||
__author__ = 'Roedi06'
|
||||
title = u'Tweakers.net'
|
||||
__author__ = 'Kovid Goyal'
|
||||
language = 'nl'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
cover_url = 'http://img51.imageshack.us/img51/7470/tweakersnetebook.gif'
|
||||
oldest_article = 4
|
||||
max_articles_per_feed = 40
|
||||
cover_url = 'http://tweakers.net/ext/launch/g/logo.gif'
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class':'columnwrapper news'}),
|
||||
{'id':'reacties'},
|
||||
]
|
||||
keep_only_tags = [dict(name='div', attrs={'class':'columnwrapper news'})]
|
||||
|
||||
remove_tags = [dict(name='div', attrs={'id' : ['utracker']}),
|
||||
{'id' : ['channelNav']},
|
||||
{'id' : ['contentArea']},
|
||||
{'class' : ['breadCrumb']},
|
||||
{'class' : ['nextPrevious ellipsis']},
|
||||
{'class' : ['advertorial']},
|
||||
{'class' : ['sidebar']},
|
||||
{'class' : ['filterBox']},
|
||||
{'id' : ['toggleButtonTxt']},
|
||||
{'id' : ['socialButtons']},
|
||||
{'class' : ['button']},
|
||||
{'class' : ['textadTop']},
|
||||
{'class' : ['commentLink']},
|
||||
{'title' : ['Reageer op deze reactie']},
|
||||
{'class' : ['pageIndex']},
|
||||
{'class' : ['reactieHeader collapsed']},
|
||||
remove_tags = [dict(name='div', attrs={'class':'reacties'}),
|
||||
{'id' : ['utracker','socialButtons','b_ac']},
|
||||
{'class' : ['sidebar','advertorial']},
|
||||
{'class' : re.compile('nextPrevious')},
|
||||
]
|
||||
no_stylesheets=True
|
||||
filter_regexps = [r'ads\.doubleclick\.net',r'ad\.doubleclick\.net']
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<hr*?>', re.IGNORECASE | re.DOTALL), lambda match : ''),
|
||||
(re.compile(r'<p>', re.IGNORECASE | re.DOTALL), lambda match : ''),
|
||||
(re.compile(r'</p>', re.IGNORECASE | re.DOTALL), lambda match : ''),
|
||||
(re.compile(r'<a.*?>'), lambda h1: '<b><u>'),
|
||||
(re.compile(r'</a>'), lambda h2: '</u></b>'),
|
||||
(re.compile(r'<span class="new">', re.IGNORECASE | re.DOTALL), lambda match : ''),
|
||||
(re.compile(r'</span>', re.IGNORECASE | re.DOTALL), lambda match : ''),
|
||||
(re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_0'), lambda match : ' - moderated 0<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_0'),
|
||||
(re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_1'), lambda match : ' - moderated +1<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_1'),
|
||||
(re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_2'), lambda match : ' - moderated +2<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_2'),
|
||||
(re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_3'), lambda match : ' - moderated +3<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_3'),
|
||||
(re.compile(r'<div class="moderation">.*?</div>'), lambda h1: ''),
|
||||
]
|
||||
feeds = [(u'Tweakers.net', u'http://tweakers.net/feeds/nieuws.xml')]
|
||||
|
||||
extra_css = '.reactieHeader { color: #333333; font-size: 6px; border-bottom:solid 2px #333333; border-top:solid 1px #333333; } \
|
||||
.reactieContent { font-family:"Times New Roman",Georgia,Serif; color: #000000; font-size: 8px; } \
|
||||
.quote { font-family:"Times New Roman",Georgia,Serif; padding-left:2px; border-left:solid 3px #666666; color: #666666; }'
|
||||
|
||||
|
||||
feeds = [(u'Tweakers.net', u'http://feeds.feedburner.com/tweakers/nieuws')]
|
||||
|
||||
def print_version(self, url):
|
||||
return url + '?max=200'
|
||||
def preprocess_html(self, soup):
|
||||
for a in soup.findAll('a', href=True, rel=True):
|
||||
if a['rel'].startswith('imageview'):
|
||||
a['src'] = a['href']
|
||||
del a['href']
|
||||
a.name = 'img'
|
||||
for x in a.findAll(True):
|
||||
x.extract()
|
||||
return soup
|
||||
|
||||
def postprocess_html(self, soup, first):
|
||||
for base in soup.findAll('base'):
|
||||
base.extract()
|
||||
return soup
|
19
recipes/vignette.recipe
Normal file
@ -0,0 +1,19 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1334935485(BasicNewsRecipe):
|
||||
title = u'Vignette'
|
||||
oldest_article = 15
|
||||
max_articles_per_feed = 100
|
||||
auto_cleanup = False
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class':['HomeFirstNewsfoto', 'photo']}),
|
||||
dict(name='img', attrs={'class':'altan-big'})
|
||||
]
|
||||
masthead_url = 'http://vauro.globalist.it/vauroglobalistit/Img/vauro-logo-beta.gif'
|
||||
feeds = [(u'Altan', u'http://feed43.com/3556647724071522.xml'), (u'Ellekappa', u'http://ellekappa.tumblr.com/rss'), (u'Vauro', u'http://feeds.feedburner.com/vauro')]
|
||||
description = 'Ellekappa, Altan, Vauro - Italian best satirical cartoons'
|
||||
language = 'it'
|
||||
__author__ = 'faber1971'
|
||||
|
||||
__version__ = 'v1.0'
|
||||
__date__ = '24, April 2012'
|
@ -8,6 +8,7 @@ class webhosting_pl(BasicNewsRecipe):
|
||||
cover_url='http://webhosting.pl/images/logo.png'
|
||||
masthead_url='http://webhosting.pl/images/logo.png'
|
||||
oldest_article = 7
|
||||
index='http://webhosting.pl'
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
remove_empty_feeds = True
|
||||
@ -36,4 +37,10 @@ class webhosting_pl(BasicNewsRecipe):
|
||||
(u'Marketing', u'http://webhosting.pl/feed/rss/n/11535')]
|
||||
|
||||
def print_version(self, url):
|
||||
return url.replace('webhosting.pl', 'webhosting.pl/print')
|
||||
return url.replace('webhosting.pl', 'webhosting.pl/print')
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for a in soup('a'):
|
||||
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
|
||||
a['href']=self.index + a['href']
|
||||
return soup
|
@ -21,7 +21,7 @@ class XkcdCom(BasicNewsRecipe):
|
||||
|
||||
use_embedded_content = False
|
||||
oldest_article = 60
|
||||
keep_only_tags = [dict(id='middleContent')]
|
||||
keep_only_tags = [dict(id='middleContainer')]
|
||||
remove_tags = [dict(name='ul'), dict(name='h3'), dict(name='br')]
|
||||
no_stylesheets = True
|
||||
# turn image bubblehelp into a paragraph
|
||||
|
@ -377,7 +377,7 @@
|
||||
<xsl:apply-templates/><br/>
|
||||
</xsl:template>
|
||||
<!-- image -->
|
||||
<xsl:template match="fb:image">
|
||||
<xsl:template match="fb:body/fb:image|fb:section/fb:image">
|
||||
<div align="center">
|
||||
<xsl:element name="img">
|
||||
<xsl:attribute name="border">1</xsl:attribute>
|
||||
@ -395,4 +395,20 @@
|
||||
</xsl:element>
|
||||
</div>
|
||||
</xsl:template>
|
||||
<xsl:template match="fb:image">
|
||||
<xsl:element name="img">
|
||||
<xsl:choose>
|
||||
<xsl:when test="starts-with(@xlink:href,'#')">
|
||||
<xsl:attribute name="src"><xsl:value-of select="substring-after(@xlink:href,'#')"/></xsl:attribute>
|
||||
</xsl:when>
|
||||
<xsl:otherwise>
|
||||
<xsl:attribute name="src"><xsl:value-of select="@xlink:href"/></xsl:attribute>
|
||||
</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
<xsl:if test="@title">
|
||||
<xsl:attribute name="title"><xsl:value-of select="@title"/></xsl:attribute>
|
||||
</xsl:if>
|
||||
</xsl:element>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
||||
|
@ -26,7 +26,7 @@ def login_to_google(username, password):
|
||||
br.form['Email'] = username
|
||||
br.form['Passwd'] = password
|
||||
raw = br.submit().read()
|
||||
if re.search(br'<title>.*?Account Settings</title>', raw) is None:
|
||||
if re.search(br'(?i)<title>.*?Account Settings</title>', raw) is None:
|
||||
x = re.search(br'(?is)<title>.*?</title>', raw)
|
||||
if x is not None:
|
||||
print ('Title of post login page: %s'%x.group())
|
||||
|
@ -12,14 +12,14 @@ msgstr ""
|
||||
"Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
|
||||
"devel@lists.alioth.debian.org>\n"
|
||||
"POT-Creation-Date: 2011-11-25 14:01+0000\n"
|
||||
"PO-Revision-Date: 2011-12-14 19:48+0000\n"
|
||||
"Last-Translator: Ferran Rius <frius64@hotmail.com>\n"
|
||||
"PO-Revision-Date: 2012-04-12 09:56+0000\n"
|
||||
"Last-Translator: Dídac Rios <didac@niorcs.com>\n"
|
||||
"Language-Team: Catalan <linux@softcatala.org>\n"
|
||||
"MIME-Version: 1.0\n"
|
||||
"Content-Type: text/plain; charset=UTF-8\n"
|
||||
"Content-Transfer-Encoding: 8bit\n"
|
||||
"X-Launchpad-Export-Date: 2011-12-15 05:18+0000\n"
|
||||
"X-Generator: Launchpad (build 14487)\n"
|
||||
"X-Launchpad-Export-Date: 2012-04-13 05:26+0000\n"
|
||||
"X-Generator: Launchpad (build 15070)\n"
|
||||
"Language: ca\n"
|
||||
|
||||
#. name for aaa
|
||||
@ -9584,31 +9584,31 @@ msgstr ""
|
||||
|
||||
#. name for hoi
|
||||
msgid "Holikachuk"
|
||||
msgstr ""
|
||||
msgstr "Holikachuk"
|
||||
|
||||
#. name for hoj
|
||||
msgid "Hadothi"
|
||||
msgstr ""
|
||||
msgstr "Hadothi"
|
||||
|
||||
#. name for hol
|
||||
msgid "Holu"
|
||||
msgstr ""
|
||||
msgstr "Holu"
|
||||
|
||||
#. name for hom
|
||||
msgid "Homa"
|
||||
msgstr ""
|
||||
msgstr "Homa"
|
||||
|
||||
#. name for hoo
|
||||
msgid "Holoholo"
|
||||
msgstr ""
|
||||
msgstr "Holoholo"
|
||||
|
||||
#. name for hop
|
||||
msgid "Hopi"
|
||||
msgstr ""
|
||||
msgstr "Hopi"
|
||||
|
||||
#. name for hor
|
||||
msgid "Horo"
|
||||
msgstr ""
|
||||
msgstr "Horo"
|
||||
|
||||
#. name for hos
|
||||
msgid "Ho Chi Minh City Sign Language"
|
||||
@ -9616,15 +9616,15 @@ msgstr "Llenguatge de signes de la ciutat de Ho Chi Minh"
|
||||
|
||||
#. name for hot
|
||||
msgid "Hote"
|
||||
msgstr ""
|
||||
msgstr "Hote"
|
||||
|
||||
#. name for hov
|
||||
msgid "Hovongan"
|
||||
msgstr ""
|
||||
msgstr "Hovongan"
|
||||
|
||||
#. name for how
|
||||
msgid "Honi"
|
||||
msgstr ""
|
||||
msgstr "Honi"
|
||||
|
||||
#. name for hoy
|
||||
msgid "Holiya"
|
||||
@ -9636,7 +9636,7 @@ msgstr ""
|
||||
|
||||
#. name for hpo
|
||||
msgid "Hpon"
|
||||
msgstr ""
|
||||
msgstr "Hpon"
|
||||
|
||||
#. name for hps
|
||||
msgid "Hawai'i Pidgin Sign Language"
|
||||
@ -9644,35 +9644,35 @@ msgstr "Hawaià Pidgin; llenguatge de signes"
|
||||
|
||||
#. name for hra
|
||||
msgid "Hrangkhol"
|
||||
msgstr ""
|
||||
msgstr "Hrangkhol"
|
||||
|
||||
#. name for hre
|
||||
msgid "Hre"
|
||||
msgstr ""
|
||||
msgstr "Hre"
|
||||
|
||||
#. name for hrk
|
||||
msgid "Haruku"
|
||||
msgstr ""
|
||||
msgstr "Haruku"
|
||||
|
||||
#. name for hrm
|
||||
msgid "Miao; Horned"
|
||||
msgstr ""
|
||||
msgstr "Miao; Horned"
|
||||
|
||||
#. name for hro
|
||||
msgid "Haroi"
|
||||
msgstr ""
|
||||
msgstr "Haroi"
|
||||
|
||||
#. name for hrr
|
||||
msgid "Horuru"
|
||||
msgstr ""
|
||||
msgstr "Horuru"
|
||||
|
||||
#. name for hrt
|
||||
msgid "Hértevin"
|
||||
msgstr ""
|
||||
msgstr "Hértevin"
|
||||
|
||||
#. name for hru
|
||||
msgid "Hruso"
|
||||
msgstr ""
|
||||
msgstr "Hruso"
|
||||
|
||||
#. name for hrv
|
||||
msgid "Croatian"
|
||||
@ -9680,11 +9680,11 @@ msgstr "Croat"
|
||||
|
||||
#. name for hrx
|
||||
msgid "Hunsrik"
|
||||
msgstr ""
|
||||
msgstr "Hunsrik"
|
||||
|
||||
#. name for hrz
|
||||
msgid "Harzani"
|
||||
msgstr ""
|
||||
msgstr "Harzani"
|
||||
|
||||
#. name for hsb
|
||||
msgid "Sorbian; Upper"
|
||||
@ -9704,31 +9704,31 @@ msgstr "Xinès; Xiang"
|
||||
|
||||
#. name for hss
|
||||
msgid "Harsusi"
|
||||
msgstr ""
|
||||
msgstr "Harsusi"
|
||||
|
||||
#. name for hti
|
||||
msgid "Hoti"
|
||||
msgstr ""
|
||||
msgstr "Hoti"
|
||||
|
||||
#. name for hto
|
||||
msgid "Huitoto; Minica"
|
||||
msgstr ""
|
||||
msgstr "Huitoto; Minica"
|
||||
|
||||
#. name for hts
|
||||
msgid "Hadza"
|
||||
msgstr ""
|
||||
msgstr "Hadza"
|
||||
|
||||
#. name for htu
|
||||
msgid "Hitu"
|
||||
msgstr ""
|
||||
msgstr "Hitu"
|
||||
|
||||
#. name for htx
|
||||
msgid "Hittite; Middle"
|
||||
msgstr ""
|
||||
msgstr "Hittite; Middle"
|
||||
|
||||
#. name for hub
|
||||
msgid "Huambisa"
|
||||
msgstr ""
|
||||
msgstr "Huambisa"
|
||||
|
||||
#. name for huc
|
||||
msgid "=/Hua"
|
||||
@ -9736,27 +9736,27 @@ msgstr ""
|
||||
|
||||
#. name for hud
|
||||
msgid "Huaulu"
|
||||
msgstr ""
|
||||
msgstr "Huaulu"
|
||||
|
||||
#. name for hue
|
||||
msgid "Huave; San Francisco Del Mar"
|
||||
msgstr ""
|
||||
msgstr "Huave; San Francisco Del Mar"
|
||||
|
||||
#. name for huf
|
||||
msgid "Humene"
|
||||
msgstr ""
|
||||
msgstr "Humene"
|
||||
|
||||
#. name for hug
|
||||
msgid "Huachipaeri"
|
||||
msgstr ""
|
||||
msgstr "Huachipaeri"
|
||||
|
||||
#. name for huh
|
||||
msgid "Huilliche"
|
||||
msgstr ""
|
||||
msgstr "Huilliche"
|
||||
|
||||
#. name for hui
|
||||
msgid "Huli"
|
||||
msgstr ""
|
||||
msgstr "Huli"
|
||||
|
||||
#. name for huj
|
||||
msgid "Miao; Northern Guiyang"
|
||||
@ -9764,15 +9764,15 @@ msgstr "Miao; Guiyang septentrional"
|
||||
|
||||
#. name for huk
|
||||
msgid "Hulung"
|
||||
msgstr ""
|
||||
msgstr "Hulung"
|
||||
|
||||
#. name for hul
|
||||
msgid "Hula"
|
||||
msgstr ""
|
||||
msgstr "Hula"
|
||||
|
||||
#. name for hum
|
||||
msgid "Hungana"
|
||||
msgstr ""
|
||||
msgstr "Hungana"
|
||||
|
||||
#. name for hun
|
||||
msgid "Hungarian"
|
||||
@ -9780,43 +9780,43 @@ msgstr "Hongarès"
|
||||
|
||||
#. name for huo
|
||||
msgid "Hu"
|
||||
msgstr ""
|
||||
msgstr "Hu"
|
||||
|
||||
#. name for hup
|
||||
msgid "Hupa"
|
||||
msgstr ""
|
||||
msgstr "Hupa"
|
||||
|
||||
#. name for huq
|
||||
msgid "Tsat"
|
||||
msgstr ""
|
||||
msgstr "Tsat"
|
||||
|
||||
#. name for hur
|
||||
msgid "Halkomelem"
|
||||
msgstr ""
|
||||
msgstr "Halkomelem"
|
||||
|
||||
#. name for hus
|
||||
msgid "Huastec"
|
||||
msgstr ""
|
||||
msgstr "Huastec"
|
||||
|
||||
#. name for hut
|
||||
msgid "Humla"
|
||||
msgstr ""
|
||||
msgstr "Humla"
|
||||
|
||||
#. name for huu
|
||||
msgid "Huitoto; Murui"
|
||||
msgstr ""
|
||||
msgstr "Huitoto; Murui"
|
||||
|
||||
#. name for huv
|
||||
msgid "Huave; San Mateo Del Mar"
|
||||
msgstr ""
|
||||
msgstr "Huave; San Mateo Del Mar"
|
||||
|
||||
#. name for huw
|
||||
msgid "Hukumina"
|
||||
msgstr ""
|
||||
msgstr "Hukumina"
|
||||
|
||||
#. name for hux
|
||||
msgid "Huitoto; Nüpode"
|
||||
msgstr ""
|
||||
msgstr "Huitoto; Nüpode"
|
||||
|
||||
#. name for huy
|
||||
msgid "Hulaulá"
|
||||
|
@ -18,27 +18,27 @@ msgstr ""
|
||||
"Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
|
||||
"devel@lists.alioth.debian.org>\n"
|
||||
"POT-Creation-Date: 2011-11-25 14:01+0000\n"
|
||||
"PO-Revision-Date: 2012-03-05 19:08+0000\n"
|
||||
"Last-Translator: Dennis Baudys <Unknown>\n"
|
||||
"PO-Revision-Date: 2012-04-21 14:42+0000\n"
|
||||
"Last-Translator: SimonFS <simonschuette@arcor.de>\n"
|
||||
"Language-Team: German <debian-l10n-german@lists.debian.org>\n"
|
||||
"MIME-Version: 1.0\n"
|
||||
"Content-Type: text/plain; charset=UTF-8\n"
|
||||
"Content-Transfer-Encoding: 8bit\n"
|
||||
"X-Launchpad-Export-Date: 2012-03-06 04:47+0000\n"
|
||||
"X-Generator: Launchpad (build 14900)\n"
|
||||
"X-Launchpad-Export-Date: 2012-04-22 04:43+0000\n"
|
||||
"X-Generator: Launchpad (build 15120)\n"
|
||||
"Language: de\n"
|
||||
|
||||
#. name for aaa
|
||||
msgid "Ghotuo"
|
||||
msgstr ""
|
||||
msgstr "Ghotuo (Nigeria)"
|
||||
|
||||
#. name for aab
|
||||
msgid "Alumu-Tesu"
|
||||
msgstr "Alumu-Tesu"
|
||||
msgstr "Alumu-Tesu (Nigeria)"
|
||||
|
||||
#. name for aac
|
||||
msgid "Ari"
|
||||
msgstr "Ari"
|
||||
msgstr "Ari (Papua-Neuguinea)"
|
||||
|
||||
#. name for aad
|
||||
msgid "Amal"
|
||||
@ -66,7 +66,7 @@ msgstr "Arifama-Miniafia"
|
||||
|
||||
#. name for aak
|
||||
msgid "Ankave"
|
||||
msgstr "Ankave"
|
||||
msgstr "Ankave (Papua-Neuguinea)"
|
||||
|
||||
#. name for aal
|
||||
msgid "Afade"
|
||||
@ -110,7 +110,7 @@ msgstr ""
|
||||
|
||||
#. name for aaw
|
||||
msgid "Solong"
|
||||
msgstr ""
|
||||
msgstr "Solong"
|
||||
|
||||
#. name for aax
|
||||
msgid "Mandobo Atas"
|
||||
@ -30860,7 +30860,7 @@ msgstr ""
|
||||
|
||||
#. name for zxx
|
||||
msgid "No linguistic content"
|
||||
msgstr ""
|
||||
msgstr "Kein linguistischer Inhalt"
|
||||
|
||||
#. name for zyb
|
||||
msgid "Zhuang; Yongbei"
|
||||
|
File diff suppressed because it is too large
@ -9,14 +9,14 @@ msgstr ""
|
||||
"Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
|
||||
"devel@lists.alioth.debian.org>\n"
|
||||
"POT-Creation-Date: 2011-11-25 14:01+0000\n"
|
||||
"PO-Revision-Date: 2012-03-06 13:55+0000\n"
|
||||
"PO-Revision-Date: 2012-04-18 13:08+0000\n"
|
||||
"Last-Translator: Asier Iturralde Sarasola <Unknown>\n"
|
||||
"Language-Team: Euskara <itzulpena@comtropos.com>\n"
|
||||
"MIME-Version: 1.0\n"
|
||||
"Content-Type: text/plain; charset=UTF-8\n"
|
||||
"Content-Transfer-Encoding: 8bit\n"
|
||||
"X-Launchpad-Export-Date: 2012-03-07 05:12+0000\n"
|
||||
"X-Generator: Launchpad (build 14907)\n"
|
||||
"X-Launchpad-Export-Date: 2012-04-19 04:36+0000\n"
|
||||
"X-Generator: Launchpad (build 15108)\n"
|
||||
"Language: eu\n"
|
||||
|
||||
#. name for aaa
|
||||
@ -27125,7 +27125,7 @@ msgstr ""
|
||||
|
||||
#. name for vie
|
||||
msgid "Vietnamese"
|
||||
msgstr "Mahastiak"
|
||||
msgstr "Vietnamera"
|
||||
|
||||
#. name for vif
|
||||
msgid "Vili"
|
||||
|
@ -10,14 +10,14 @@ msgstr ""
|
||||
"Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
|
||||
"devel@lists.alioth.debian.org>\n"
|
||||
"POT-Creation-Date: 2011-11-25 14:01+0000\n"
|
||||
"PO-Revision-Date: 2011-11-11 00:16+0000\n"
|
||||
"PO-Revision-Date: 2012-04-22 07:11+0000\n"
|
||||
"Last-Translator: kulkke <Unknown>\n"
|
||||
"Language-Team: Turkish <gnome-turk@gnome.org>\n"
|
||||
"MIME-Version: 1.0\n"
|
||||
"Content-Type: text/plain; charset=UTF-8\n"
|
||||
"Content-Transfer-Encoding: 8bit\n"
|
||||
"X-Launchpad-Export-Date: 2011-11-26 05:42+0000\n"
|
||||
"X-Generator: Launchpad (build 14381)\n"
|
||||
"X-Launchpad-Export-Date: 2012-04-23 04:45+0000\n"
|
||||
"X-Generator: Launchpad (build 15135)\n"
|
||||
"Language: tr\n"
|
||||
|
||||
#. name for aaa
|
||||
@ -7371,7 +7371,7 @@ msgstr ""
|
||||
|
||||
#. name for est
|
||||
msgid "Estonian"
|
||||
msgstr "Estonyaca"
|
||||
msgstr "Estonca"
|
||||
|
||||
#. name for esu
|
||||
msgid "Yupik; Central"
|
||||
@ -11131,7 +11131,7 @@ msgstr ""
|
||||
|
||||
#. name for kaz
|
||||
msgid "Kazakh"
|
||||
msgstr "Kazak Dili"
|
||||
msgstr "Kazakça"
|
||||
|
||||
#. name for kba
|
||||
msgid "Kalarko"
|
||||
@ -13767,7 +13767,7 @@ msgstr ""
|
||||
|
||||
#. name for lav
|
||||
msgid "Latvian"
|
||||
msgstr "Letonyaca"
|
||||
msgstr "Letonca"
|
||||
|
||||
#. name for law
|
||||
msgid "Lauje"
|
||||
@ -16031,7 +16031,7 @@ msgstr ""
|
||||
|
||||
#. name for mkd
|
||||
msgid "Macedonian"
|
||||
msgstr "Makedonyaca"
|
||||
msgstr "Makedonca"
|
||||
|
||||
#. name for mke
|
||||
msgid "Mawchi"
|
||||
@ -22227,7 +22227,7 @@ msgstr ""
|
||||
|
||||
#. name for ron
|
||||
msgid "Romanian"
|
||||
msgstr "Romence"
|
||||
msgstr "Rumence"
|
||||
|
||||
#. name for roo
|
||||
msgid "Rotokas"
|
||||
|
@ -4,7 +4,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
__appname__ = u'calibre'
numeric_version = (0, 8, 46)
numeric_version = (0, 8, 49)
__version__ = u'.'.join(map(unicode, numeric_version))
__author__ = u"Kovid Goyal <kovid@kovidgoyal.net>"

@ -259,7 +259,7 @@ class LRXMetadataReader(MetadataReaderPlugin):
class MOBIMetadataReader(MetadataReaderPlugin):

    name        = 'Read MOBI metadata'
    file_types  = set(['mobi', 'prc', 'azw', 'azw4', 'pobi'])
    file_types  = set(['mobi', 'prc', 'azw', 'azw3', 'azw4', 'pobi'])
    description = _('Read metadata from %s files')%'MOBI'

    def get_metadata(self, stream, ftype):
||||
|
||||
from calibre.devices.usbms.driver import USBMS
|
||||
|
||||
HTC_BCDS = [0x100, 0x0222, 0x0226, 0x227, 0x228]
|
||||
|
||||
class ANDROID(USBMS):
|
||||
|
||||
name = 'Android driver'
|
||||
@ -23,23 +25,24 @@ class ANDROID(USBMS):
|
||||
|
||||
VENDOR_ID = {
|
||||
# HTC
|
||||
0x0bb4 : { 0xc02 : [0x100, 0x0227, 0x0226, 0x222],
|
||||
0xc01 : [0x100, 0x0227, 0x0226],
|
||||
0xff9 : [0x0100, 0x0227, 0x0226],
|
||||
0xc86 : [0x100, 0x0227, 0x0226, 0x222],
|
||||
0xc87 : [0x0100, 0x0227, 0x0226],
|
||||
0xc8d : [0x100, 0x0227, 0x0226, 0x222],
|
||||
0xc91 : [0x0100, 0x0227, 0x0226],
|
||||
0xc92 : [0x100, 0x0227, 0x0226, 0x222],
|
||||
0xc97 : [0x100, 0x0227, 0x0226, 0x222],
|
||||
0xc99 : [0x100, 0x0227, 0x0226, 0x222],
|
||||
0xca2 : [0x100, 0x0227, 0x0226, 0x222],
|
||||
0xca3 : [0x100, 0x0227, 0x0226, 0x222],
|
||||
0xca4 : [0x100, 0x0227, 0x0226, 0x222],
|
||||
0xca9 : [0x100, 0x0227, 0x0226, 0x222],
|
||||
0xcac : [0x100, 0x0227, 0x0226, 0x222],
|
||||
0xccf : [0x100, 0x0227, 0x0226, 0x222],
|
||||
0x2910 : [0x222],
|
||||
0x0bb4 : { 0xc02 : HTC_BCDS,
|
||||
0xc01 : HTC_BCDS,
|
||||
0xff9 : HTC_BCDS,
|
||||
0xc86 : HTC_BCDS,
|
||||
0xc87 : HTC_BCDS,
|
||||
0xc8d : HTC_BCDS,
|
||||
0xc91 : HTC_BCDS,
|
||||
0xc92 : HTC_BCDS,
|
||||
0xc97 : HTC_BCDS,
|
||||
0xc99 : HTC_BCDS,
|
||||
0xca2 : HTC_BCDS,
|
||||
0xca3 : HTC_BCDS,
|
||||
0xca4 : HTC_BCDS,
|
||||
0xca9 : HTC_BCDS,
|
||||
0xcac : HTC_BCDS,
|
||||
0xccf : HTC_BCDS,
|
||||
0x2910 : HTC_BCDS,
|
||||
0xff9 : [0x9999],
|
||||
},
|
||||
|
||||
# Eken
|
||||
@ -174,7 +177,7 @@ class ANDROID(USBMS):
|
||||
'TELECHIP', 'HUAWEI', 'T-MOBILE', 'SEMC', 'LGE', 'NVIDIA',
|
||||
'GENERIC-', 'ZTE', 'MID', 'QUALCOMM', 'PANDIGIT', 'HYSTON',
|
||||
'VIZIO', 'GOOGLE', 'FREESCAL', 'KOBO_INC', 'LENOVO', 'ROCKCHIP',
|
||||
'POCKET', 'ONDA_MID', 'ZENITHIN', 'INGENIC']
|
||||
'POCKET', 'ONDA_MID', 'ZENITHIN', 'INGENIC', 'PMID701C']
|
||||
WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE',
|
||||
'__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897',
|
||||
'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959_CARD', 'SGH-T959', 'SAMSUNG_ANDROID',
|
||||
@ -189,7 +192,8 @@ class ANDROID(USBMS):
|
||||
'UMS', '.K080', 'P990', 'LTE', 'MB853', 'GT-S5660_CARD', 'A107',
|
||||
'GT-I9003_CARD', 'XT912', 'FILE-CD_GADGET', 'RK29_SDK', 'MB855',
|
||||
'XT910', 'BOOK_A10', 'USB_2.0_DRIVER', 'I9100T', 'P999DW',
|
||||
'KTABLET_PC', 'INGENIC', 'GT-I9001_CARD']
|
||||
'KTABLET_PC', 'INGENIC', 'GT-I9001_CARD', 'USB_2.0_DRIVER',
|
||||
'GT-S5830L_CARD']
|
||||
WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
|
||||
'FILE-STOR_GADGET', 'SGH-T959_CARD', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
|
||||
'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD',
|
||||
@ -197,7 +201,7 @@ class ANDROID(USBMS):
|
||||
'ANDROID_MID', 'P990_SD_CARD', '.K080', 'LTE_CARD', 'MB853',
|
||||
'A1-07___C0541A4F', 'XT912', 'MB855', 'XT910', 'BOOK_A10_CARD',
|
||||
'USB_2.0_DRIVER', 'I9100T', 'P999DW_SD_CARD', 'KTABLET_PC',
|
||||
'FILE-CD_GADGET', 'GT-I9001_CARD']
|
||||
'FILE-CD_GADGET', 'GT-I9001_CARD', 'USB_2.0_DRIVER']
|
||||
|
||||
OSX_MAIN_MEM = 'Android Device Main Memory'
|
||||
|
||||
|
@ -325,6 +325,10 @@ class KINDLE2(KINDLE):
    OPT_APNX_ACCURATE = 1
    OPT_APNX_CUST_COL = 2

    def formats_to_scan_for(self):
        ans = USBMS.formats_to_scan_for(self) | {'azw3'}
        return ans

    def books(self, oncard=None, end_session=True):
        bl = USBMS.books(self, oncard=oncard, end_session=end_session)
        # Read collections information
@ -423,6 +427,8 @@ class KINDLE_FIRE(KINDLE2):
    name = 'Kindle Fire Device Interface'
    description = _('Communicate with the Kindle Fire')
    gui_name = 'Fire'
    FORMATS = list(KINDLE2.FORMATS)
    FORMATS.insert(0, 'azw3')

    PRODUCT_ID = [0x0006]
    BCD = [0x216, 0x100]
@ -298,7 +298,7 @@ class KOBO(USBMS):
        changed = False
        for i, row in enumerate(cursor):
            # self.report_progress((i+1) / float(numrows), _('Getting list of books on device...'))
            if row[3].startswith("file:///usr/local/Kobo/help/"):
            if not hasattr(row[3], 'startswith') or row[3].startswith("file:///usr/local/Kobo/help/"):
                # These are internal to the Kobo device and do not exist
                continue
            path = self.path_from_contentid(row[3], row[5], row[4], oncard)

@ -86,7 +86,8 @@ class NOOK_COLOR(NOOK):
    PRODUCT_ID = [0x002, 0x003, 0x004]
    BCD = [0x216]

    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['EBOOK_DISK', 'NOOK_TABLET']
    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['EBOOK_DISK', 'NOOK_TABLET',
        'NOOK_SIMPLETOUCH']
    EBOOK_DIR_MAIN = 'My Files'
    NEWS_IN_FOLDER = False

@ -307,11 +307,21 @@ class PRST1(USBMS):

        # Work-around for Sony Bug (SD Card DB not using right SQLite sequence)
        if source_id == 1:
            # Update any existing sequence numbers in the table that aren't in the required range
            sdcard_sequence_start = '4294967296'
            query = 'UPDATE sqlite_sequence SET seq = ? WHERE seq < ?'
            t = (sdcard_sequence_start, sdcard_sequence_start,)
            cursor.execute(query, t)

            # Insert sequence numbers for tables we will be manipulating, if they don't already exist
            query = ('INSERT INTO sqlite_sequence (name, seq) '
                     'SELECT ?, ? '
                     'WHERE NOT EXISTS (SELECT 1 FROM sqlite_sequence WHERE name = ?)');
            cursor.execute(query, ('books',sdcard_sequence_start,'books',))
            cursor.execute(query, ('collection',sdcard_sequence_start,'collection',))
            cursor.execute(query, ('collections',sdcard_sequence_start,'collections',))

        for book in booklist:
            # Run through plugboard if needed
            if plugboard is not None:
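Aside: the Sony work-around above seeds sqlite_sequence so that AUTOINCREMENT ids on the SD-card database start above 2**32. A self-contained sketch of the same UPDATE plus INSERT ... WHERE NOT EXISTS idiom against a throwaway database; the table name and values are illustrative only, not the driver's schema.

    import sqlite3

    conn = sqlite3.connect(':memory:')
    cur = conn.cursor()
    cur.execute('CREATE TABLE books (_id INTEGER PRIMARY KEY AUTOINCREMENT, title TEXT)')

    start = 4294967296  # force new AUTOINCREMENT ids above 2**32
    # bump any existing sequence counters that sit below the required range
    cur.execute('UPDATE sqlite_sequence SET seq = ? WHERE seq < ?', (start, start))
    # create the sequence row only if the table does not have one yet
    cur.execute('INSERT INTO sqlite_sequence (name, seq) '
                'SELECT ?, ? WHERE NOT EXISTS '
                '(SELECT 1 FROM sqlite_sequence WHERE name = ?)',
                ('books', start, 'books'))
    conn.commit()

    cur.execute('INSERT INTO books (title) VALUES (?)', ('test',))
    print(cur.lastrowid)  # -> 4294967297, i.e. above the 2**32 threshold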
@ -128,6 +128,9 @@ class USBMS(CLI, Device):
        elif location_code == 'B':
            self._update_driveinfo_file(self._card_b_prefix, location_code, name)

    def formats_to_scan_for(self):
        return set(self.settings().format_map) | set(self.FORMATS)

    def books(self, oncard=None, end_session=True):
        from calibre.ebooks.metadata.meta import path_to_ext
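formats_to_scan_for() is the new hook that the KINDLE2 driver earlier in this diff overrides to add 'azw3' to the scan set. A hedged sketch of the pattern with stand-in class names (these are not the calibre classes):

    class BaseDriver(object):
        FORMATS = ['azw', 'mobi', 'prc']

        def settings_format_map(self):
            # stand-in for self.settings().format_map, i.e. the user's configured formats
            return ['epub', 'mobi']

        def formats_to_scan_for(self):
            return set(self.settings_format_map()) | set(self.FORMATS)

    class Kindle2Like(BaseDriver):
        def formats_to_scan_for(self):
            # extend the base set instead of replacing it, as the real driver does
            return super(Kindle2Like, self).formats_to_scan_for() | {'azw3'}

    print(sorted(Kindle2Like().formats_to_scan_for()))
    # -> ['azw', 'azw3', 'epub', 'mobi', 'prc']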
@ -166,7 +169,7 @@ class USBMS(CLI, Device):
        for idx,b in enumerate(bl):
            bl_cache[b.lpath] = idx

        all_formats = set(self.settings().format_map) | set(self.FORMATS)
        all_formats = self.formats_to_scan_for()

        def update_booklist(filename, path, prefix):
            changed = False
@ -31,7 +31,7 @@ BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'text', 'ht
        'epub', 'fb2', 'djv', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip',
        'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'pmlz', 'mbp', 'tan', 'snb',
        'xps', 'oxps', 'azw4', 'book', 'zbf', 'pobi', 'docx', 'md',
        'textile', 'markdown', 'ibook', 'iba']
        'textile', 'markdown', 'ibook', 'iba', 'azw3']

class HTMLRenderer(object):

@ -93,6 +93,20 @@ def extract_calibre_cover(raw, base, log):
        if os.path.exists(img):
            return open(img, 'rb').read()

    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    if matches is None:
        body = soup.find('body')
        if body is not None:
            text = u''.join(map(unicode, body.findAll(text=True)))
            if text.strip():
                # Body has text, abort
                return
            images = body.findAll('img', src=True)
            if 0 < len(images) < 2:
                img = os.path.join(base, *images[0]['src'].split('/'))
                if os.path.exists(img):
                    return open(img, 'rb').read()
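As an aside, the "simple cover" heuristic added above is easy to reproduce in isolation. A hedged sketch using the BeautifulSoup 3 API that the surrounding function appears to rely on; the sample HTML and the import form are illustrative assumptions, not calibre code:

    from BeautifulSoup import BeautifulSoup  # BS3-style API; calibre bundles its own copy

    raw = '<html><body><img src="cover.jpg"/></body></html>'  # invented sample
    soup = BeautifulSoup(raw)
    body = soup.find('body')
    if body is not None:
        text = u''.join(map(unicode, body.findAll(text=True)))
        images = body.findAll('img', src=True)
        # a body with no visible text and exactly one image is treated as a cover page
        if not text.strip() and len(images) == 1:
            print(images[0]['src'])  # -> cover.jpg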
def render_html_svg_workaround(path_to_html, log, width=590, height=750):
    from calibre.ebooks.oeb.base import SVG_NS
    raw = open(path_to_html, 'rb').read()
@ -108,6 +122,7 @@ def render_html_svg_workaround(path_to_html, log, width=590, height=750):
            data = extract_calibre_cover(raw, os.path.dirname(path_to_html), log)
        except:
            pass

    if data is None:
        renderer = render_html(path_to_html, width, height)
        data = getattr(renderer, 'data', None)
@ -156,9 +156,10 @@ def add_pipeline_options(parser, plumber):
    'SEARCH AND REPLACE' : (
        _('Modify the document text and structure using user defined patterns.'),
        [
            'sr1_search', 'sr1_replace',
            'sr2_search', 'sr2_replace',
            'sr3_search', 'sr3_replace',
            'sr1_search', 'sr1_replace',
            'sr2_search', 'sr2_replace',
            'sr3_search', 'sr3_replace',
            'search_replace',
        ]
    ),

@ -211,6 +212,7 @@ def add_pipeline_options(parser, plumber):
        if rec.level < rec.HIGH:
            option_recommendation_to_cli_option(add_option, rec)


def option_parser():
    parser = OptionParser(usage=USAGE)
    parser.add_option('--list-recipes', default=False, action='store_true',
@ -271,6 +273,34 @@ def abspath(x):
        return x
    return os.path.abspath(os.path.expanduser(x))

def read_sr_patterns(path, log=None):
    import json, re, codecs
    pats = []
    with codecs.open(path, 'r', 'utf-8') as f:
        pat = None
        for line in f.readlines():
            if line.endswith(u'\n'):
                line = line[:-1]

            if pat is None:
                if not line.strip():
                    continue
                try:
                    re.compile(line)
                except:
                    msg = u'Invalid regular expression: %r from file: %r'%(
                        line, path)
                    if log is not None:
                        log.error(msg)
                        raise SystemExit(1)
                    else:
                        raise ValueError(msg)
                pat = line
            else:
                pats.append((pat, line))
                pat = None
    return json.dumps(pats)

def main(args=sys.argv):
    log = Log()
    parser, plumber = create_option_parser(args, log)
@ -278,6 +308,9 @@ def main(args=sys.argv):
    for x in ('read_metadata_from_opf', 'cover'):
        if getattr(opts, x, None) is not None:
            setattr(opts, x, abspath(getattr(opts, x)))
    if opts.search_replace:
        opts.search_replace = read_sr_patterns(opts.search_replace, log)

    recommendations = [(n.dest, getattr(opts, n.dest),
        OptionRecommendation.HIGH) \
        for n in parser.options_iter()
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
|
||||
def run_mobi_unpack(stream, options, log, accelerators):
|
||||
from mobiunpack.mobi_unpack import Mobi8Reader
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
|
||||
wdir = PersistentTemporaryDirectory('_unpack_space')
|
||||
m8r = Mobi8Reader(stream, wdir)
|
||||
if m8r.isK8():
|
||||
epub_path = m8r.processMobi8()
|
||||
epub_input = plugin_for_input_format('epub')
|
||||
for opt in epub_input.options:
|
||||
setattr(options, opt.option.name, opt.recommended_value)
|
||||
options.input_encoding = m8r.getCodec()
|
||||
return epub_input.convert(open(epub_path,'rb'), options,
|
||||
'epub', log, accelerators)
|
||||
|
||||
class MOBIInput(InputFormatPlugin):
|
||||
|
||||
name = 'MOBI Input'
|
||||
author = 'Kovid Goyal'
|
||||
description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
|
||||
file_types = set(['mobi', 'prc', 'azw'])
|
||||
file_types = set(['mobi', 'prc', 'azw', 'azw3'])
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
self.is_kf8 = False
|
||||
|
||||
if os.environ.get('USE_MOBIUNPACK', None) is not None:
|
||||
pos = stream.tell()
|
||||
try:
|
||||
return run_mobi_unpack(stream, options, log, accelerators)
|
||||
except Exception:
|
||||
log.exception('mobi_unpack code not working')
|
||||
stream.seek(pos)
|
||||
|
||||
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
|
||||
from lxml import html
|
||||
parse_cache = {}
|
||||
|
@ -6,8 +6,6 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from cStringIO import StringIO
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin
|
||||
from calibre.customize.conversion import OptionRecommendation
|
||||
|
||||
@ -79,18 +77,9 @@ class MOBIOutput(OutputFormatPlugin):
|
||||
def check_for_masthead(self):
|
||||
found = 'masthead' in self.oeb.guide
|
||||
if not found:
|
||||
from calibre.ebooks import generate_masthead
|
||||
self.oeb.log.debug('No masthead found in manifest, generating default mastheadImage...')
|
||||
try:
|
||||
from PIL import Image as PILImage
|
||||
PILImage
|
||||
except ImportError:
|
||||
import Image as PILImage
|
||||
|
||||
raw = open(P('content_server/calibre_banner.png'), 'rb')
|
||||
im = PILImage.open(raw)
|
||||
of = StringIO()
|
||||
im.save(of, 'GIF')
|
||||
raw = of.getvalue()
|
||||
raw = generate_masthead(unicode(self.oeb.metadata['title'][0]))
|
||||
id, href = self.oeb.manifest.generate('masthead', 'masthead')
|
||||
self.oeb.manifest.add(id, href, 'image/gif', data=raw)
|
||||
self.oeb.guide.add('masthead', 'Masthead Image', href)
|
||||
@ -151,13 +140,70 @@ class MOBIOutput(OutputFormatPlugin):
|
||||
# Fix up the periodical href to point to first section href
|
||||
toc.nodes[0].href = toc.nodes[0].nodes[0].href
|
||||
|
||||
def remove_html_cover(self):
|
||||
from calibre.ebooks.oeb.base import OEB_DOCS
|
||||
|
||||
oeb = self.oeb
|
||||
if not oeb.metadata.cover \
|
||||
or 'cover' not in oeb.guide:
|
||||
return
|
||||
href = oeb.guide['cover'].href
|
||||
del oeb.guide['cover']
|
||||
item = oeb.manifest.hrefs[href]
|
||||
if item.spine_position is not None:
|
||||
self.log.warn('Found an HTML cover: ', item.href, 'removing it.',
|
||||
'If you find some content missing from the output MOBI, it '
|
||||
'is because you misidentified the HTML cover in the input '
|
||||
'document')
|
||||
oeb.spine.remove(item)
|
||||
if item.media_type in OEB_DOCS:
|
||||
self.oeb.manifest.remove(item)
|
||||
|
||||
def convert(self, oeb, output_path, input_plugin, opts, log):
|
||||
from calibre.utils.config import tweaks
|
||||
from calibre.ebooks.mobi.writer2.resources import Resources
|
||||
self.log, self.opts, self.oeb = log, opts, oeb
|
||||
|
||||
mobi_type = tweaks.get('test_mobi_output_type', 'old')
|
||||
if self.is_periodical:
|
||||
mobi_type = 'old' # Amazon does not support KF8 periodicals
|
||||
create_kf8 = mobi_type in ('new', 'both')
|
||||
|
||||
self.remove_html_cover()
|
||||
resources = Resources(oeb, opts, self.is_periodical,
|
||||
add_fonts=create_kf8)
|
||||
self.check_for_periodical()
|
||||
|
||||
if create_kf8:
|
||||
# Split on pagebreaks so that the resulting KF8 works better with
|
||||
# calibre's viewer, which does not support CSS page breaks
|
||||
from calibre.ebooks.oeb.transforms.split import Split
|
||||
Split()(self.oeb, self.opts)
|
||||
|
||||
|
||||
kf8 = self.create_kf8(resources, for_joint=mobi_type=='both'
|
||||
) if create_kf8 else None
|
||||
if mobi_type == 'new':
|
||||
kf8.write(output_path)
|
||||
self.extract_mobi(output_path, opts)
|
||||
return
|
||||
|
||||
self.log('Creating MOBI 6 output')
|
||||
self.write_mobi(input_plugin, output_path, kf8, resources)
|
||||
|
||||
def create_kf8(self, resources, for_joint=False):
|
||||
from calibre.ebooks.mobi.writer8.main import create_kf8_book
|
||||
return create_kf8_book(self.oeb, self.opts, resources,
|
||||
for_joint=for_joint)
|
||||
|
||||
def write_mobi(self, input_plugin, output_path, kf8, resources):
|
||||
from calibre.ebooks.mobi.mobiml import MobiMLizer
|
||||
from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
|
||||
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
|
||||
from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
|
||||
opts, oeb = self.opts, self.oeb
|
||||
if not opts.no_inline_toc:
|
||||
tocadder = HTMLTOCAdder(title=opts.toc_title, position='start' if
|
||||
opts.mobi_toc_at_start else 'end')
|
||||
@ -169,15 +215,19 @@ class MOBIOutput(OutputFormatPlugin):
|
||||
rasterizer(oeb, opts)
|
||||
except Unavailable:
|
||||
self.log.warn('SVG rasterizer unavailable, SVG will not be converted')
|
||||
else:
|
||||
# Add rasterized SVG images
|
||||
resources.add_extra_images()
|
||||
mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
|
||||
mobimlizer(oeb, opts)
|
||||
self.check_for_periodical()
|
||||
write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz')
|
||||
from calibre.ebooks.mobi.writer2.main import MobiWriter
|
||||
writer = MobiWriter(opts,
|
||||
writer = MobiWriter(opts, resources, kf8,
|
||||
write_page_breaks_after_item=write_page_breaks_after_item)
|
||||
writer(oeb, output_path)
|
||||
self.extract_mobi(output_path, opts)
|
||||
|
||||
def extract_mobi(self, output_path, opts):
|
||||
if opts.extract_to is not None:
|
||||
from calibre.ebooks.mobi.debug.main import inspect_mobi
|
||||
ddir = opts.extract_to
|
||||
|
@ -536,7 +536,7 @@ OptionRecommendation(name='pubdate',

OptionRecommendation(name='timestamp',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the book timestamp (used by the date column in calibre).')),
    help=_('Set the book timestamp (no longer used anywhere)')),

OptionRecommendation(name='enable_heuristics',
    recommended_value=False, level=OptionRecommendation.LOW,
@ -626,6 +626,14 @@ OptionRecommendation(name='sr3_search',
OptionRecommendation(name='sr3_replace',
    recommended_value='', level=OptionRecommendation.LOW,
    help=_('Replacement to replace the text found with sr3-search.')),

OptionRecommendation(name='search_replace',
    recommended_value=None, level=OptionRecommendation.LOW, help=_(
        'Path to a file containing search and replace regular expressions. '
        'The file must contain alternating lines of regular expression '
        'followed by replacement pattern (which can be an empty line). '
        'The regular expression must be in the python regex syntax and '
        'the file must be UTF-8 encoded.')),
]
# }}}

@ -5,7 +5,7 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import functools, re
|
||||
import functools, re, json
|
||||
|
||||
from calibre import entity_to_unicode, as_unicode
|
||||
|
||||
@ -515,18 +515,31 @@ class HTMLPreProcessor(object):
|
||||
if not getattr(self.extra_opts, 'keep_ligatures', False):
|
||||
html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
|
||||
|
||||
for search, replace in [['sr3_search', 'sr3_replace'], ['sr2_search', 'sr2_replace'], ['sr1_search', 'sr1_replace']]:
|
||||
# Function for processing search and replace
|
||||
def do_search_replace(search_pattern, replace_txt):
|
||||
try:
|
||||
search_re = re.compile(search_pattern)
|
||||
if not replace_txt:
|
||||
replace_txt = ''
|
||||
rules.insert(0, (search_re, replace_txt))
|
||||
except Exception as e:
|
||||
self.log.error('Failed to parse %r regexp because %s' %
|
||||
(search, as_unicode(e)))
|
||||
|
||||
# search / replace using the sr?_search / sr?_replace options
|
||||
for i in range(1, 4):
|
||||
search, replace = 'sr%d_search'%i, 'sr%d_replace'%i
|
||||
search_pattern = getattr(self.extra_opts, search, '')
|
||||
replace_txt = getattr(self.extra_opts, replace, '')
|
||||
if search_pattern:
|
||||
try:
|
||||
search_re = re.compile(search_pattern)
|
||||
replace_txt = getattr(self.extra_opts, replace, '')
|
||||
if not replace_txt:
|
||||
replace_txt = ''
|
||||
rules.insert(0, (search_re, replace_txt))
|
||||
except Exception as e:
|
||||
self.log.error('Failed to parse %r regexp because %s' %
|
||||
(search, as_unicode(e)))
|
||||
do_search_replace(search_pattern, replace_txt)
|
||||
|
||||
# multi-search / replace using the search_replace option
|
||||
search_replace = getattr(self.extra_opts, 'search_replace', None)
|
||||
if search_replace:
|
||||
search_replace = json.loads(search_replace)
|
||||
for search_pattern, replace_txt in search_replace:
|
||||
do_search_replace(search_pattern, replace_txt)
|
||||
|
||||
end_rules = []
|
||||
# delete soft hyphens - moved here so it's executed after header/footer removal
|
||||
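Both code paths above funnel into do_search_replace(), which compiles each expression and prepends a (regex, replacement) pair to rules. A small illustration of the mechanism with invented patterns, showing how the JSON payload produced by read_sr_patterns() is consumed; this is a sketch, not the pipeline code itself:

    import json, re

    # opts.search_replace arrives as JSON text produced by read_sr_patterns()
    payload = json.dumps([[r'(?i)chapter\s+(\d+)', r'Chapter \1']])

    rules = []
    for pattern, replacement in json.loads(payload):
        rules.insert(0, (re.compile(pattern), replacement))  # user rules run first

    html = '<p>chapter 7</p>'
    for search_re, replace_txt in rules:
        html = search_re.sub(replace_txt, html)
    print(html)  # -> <p>Chapter 7</p>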
@ -546,7 +559,7 @@ class HTMLPreProcessor(object):
|
||||
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
|
||||
end_rules.append(
|
||||
# Un wrap using punctuation
|
||||
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
||||
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
||||
)
|
||||
|
||||
for rule in self.PREPROCESS + start_rules:
|
||||
|
@ -148,6 +148,7 @@ class HeuristicProcessor(object):
|
||||
return wordcount.words
|
||||
|
||||
def markup_italicis(self, html):
|
||||
self.log.debug("\n\n\nitalicize debugging \n\n\n")
|
||||
ITALICIZE_WORDS = [
|
||||
'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
|
||||
'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
|
||||
@ -156,28 +157,30 @@ class HeuristicProcessor(object):
|
||||
]
|
||||
|
||||
ITALICIZE_STYLE_PATS = [
|
||||
ur'(?msu)(?<=[\s>"“\'‘])_(?P<words>[^_]+)_',
|
||||
ur'(?msu)(?<=[\s>"“\'‘])/(?P<words>[^/\*><]+)/',
|
||||
ur'(?msu)(?<=[\s>"“\'‘])_\*/(?P<words>[^\*_]+)/\*_',
|
||||
ur'(?msu)(?<=[\s>"“\'‘])~~(?P<words>[^~]+)~~',
|
||||
ur'(?msu)(?<=[\s>"“\'‘])\*(?P<words>[^\*]+)\*',
|
||||
ur'(?msu)(?<=[\s>"“\'‘])~(?P<words>[^~]+)~',
|
||||
ur'(?msu)(?<=[\s>"“\'‘])_/(?P<words>[^/_]+)/_',
|
||||
ur'(?msu)(?<=[\s>"“\'‘])_\*(?P<words>[^\*_]+)\*_',
|
||||
ur'(?msu)(?<=[\s>"“\'‘])\*/(?P<words>[^/\*]+)/\*',
|
||||
ur'(?msu)(?<=[\s>"“\'‘])_\*/(?P<words>[^\*_]+)/\*_',
|
||||
ur'(?msu)(?<=[\s>"“\'‘])/:(?P<words>[^:/]+):/',
|
||||
ur'(?msu)(?<=[\s>"“\'‘])\|:(?P<words>[^:\|]+):\|',
|
||||
ur'(?msu)(?<=[\s>"“\'‘])\*(?P<words>[^\*]+)\*',
|
||||
ur'(?msu)(?<=[\s>"“\'‘])~(?P<words>[^~]+)~',
|
||||
ur'(?msu)(?<=[\s>"“\'‘])/(?P<words>[^/\*><]+)/',
|
||||
ur'(?msu)(?<=[\s>"“\'‘])_(?P<words>[^_]+)_'
|
||||
]
|
||||
|
||||
for word in ITALICIZE_WORDS:
|
||||
html = re.sub(r'(?<=\s|>)' + re.escape(word) + r'(?=\s|<)', '<i>%s</i>' % word, html)
|
||||
|
||||
def sub(mo):
|
||||
return '<i>%s</i>'%mo.group('words')
|
||||
|
||||
search_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
|
||||
search_text = re.sub(r'<[^>]*>', '', search_text)
|
||||
for pat in ITALICIZE_STYLE_PATS:
|
||||
html = re.sub(pat, sub, html)
|
||||
|
||||
for match in re.finditer(pat, search_text):
|
||||
ital_string = str(match.group('words'))
|
||||
#self.log.debug("italicising "+str(match.group(0))+" with <i>"+ital_string+"</i>")
|
||||
html = re.sub(re.escape(str(match.group(0))), '<i>%s</i>' % ital_string, html)
|
||||
|
||||
return html
|
||||
|
||||
def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
|
||||
@ -316,13 +319,20 @@ class HeuristicProcessor(object):
|
||||
'''
|
||||
Unwraps lines based on line length and punctuation
|
||||
supports a range of html markup and text files
|
||||
|
||||
the lookahead regex below is meant look for any non-full stop characters - punctuation
|
||||
characters which can be used as a full stop should *not* be added below - e.g. ?!“”. etc
|
||||
the reason for this is to prevent false positive wrapping. False positives are more
|
||||
difficult to detect than false negatives during a manual review of the doc
|
||||
|
||||
This function intentionally leaves hyphenated content alone as that is handled by the
|
||||
dehyphenate routine in a separate step
|
||||
'''
|
||||
# define the pieces of the regex
|
||||
|
||||
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
|
||||
# define the pieces of the regex
|
||||
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
|
||||
em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
|
||||
soft_hyphen = u"\xad"
|
||||
dash = u"\x2d" # some ocrs doesn't convert dashes to hyphens
|
||||
line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
|
||||
blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
|
||||
line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
|
||||
@ -331,23 +341,19 @@ class HeuristicProcessor(object):
|
||||
unwrap_regex = lookahead+line_ending+blanklines+line_opening
|
||||
em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
|
||||
shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
|
||||
dash_unwrap_regex = dash+line_ending+blanklines+line_opening
|
||||
|
||||
if format == 'txt':
|
||||
unwrap_regex = lookahead+txt_line_wrap
|
||||
em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
|
||||
shy_unwrap_regex = soft_hyphen+txt_line_wrap
|
||||
dash_unwrap_regex = dash+txt_line_wrap
|
||||
|
||||
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
|
||||
em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
|
||||
shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
|
||||
dash_unwrap = re.compile(u"%s" % dash_unwrap_regex, re.UNICODE)
|
||||
|
||||
content = unwrap.sub(' ', content)
|
||||
content = em_en_unwrap.sub('', content)
|
||||
content = shy_unwrap.sub('', content)
|
||||
content = dash_unwrap.sub('', content)
|
||||
return content
|
||||
|
||||
def txt_process(self, match):
|
||||
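The docstring above explains why the look-behind lists only characters that cannot end a sentence: a broken line is rejoined only when it ends in something like a comma or a lowercase letter, which keeps false-positive joins rare. A deliberately simplified sketch of that unwrap idea (the production expressions above also cope with span/div/blank-paragraph markup, em/en dashes and soft hyphens):

    import re

    # Simplified stand-in for unwrap_regex: join a paragraph break only when the
    # text before it ends in a lowercase letter or a comma.
    lookbehind = r'(?<=[a-z,])'
    line_ending = r'\s*</p>'
    line_opening = r'<p>\s*'
    unwrap = re.compile(lookbehind + line_ending + line_opening, re.UNICODE)

    wrapped = '<p>the line was broken,</p><p>mid sentence.</p><p>A new sentence.</p>'
    print(unwrap.sub(' ', wrapped))
    # -> <p>the line was broken, mid sentence.</p><p>A new sentence.</p>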
@ -460,27 +466,31 @@ class HeuristicProcessor(object):
|
||||
return html
|
||||
|
||||
def detect_whitespace(self, html):
|
||||
blanks_around_headings = re.compile(r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<heading><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
|
||||
blanks_around_headings = re.compile(r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
|
||||
blanks_around_scene_breaks = re.compile(r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><p class="scenebreak"[^>]*>.*?</p>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
|
||||
blanks_n_nopunct = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE|re.DOTALL)
|
||||
|
||||
def merge_header_whitespace(match):
|
||||
initblanks = match.group('initparas')
|
||||
endblanks = match.group('initparas')
|
||||
heading = match.group('heading')
|
||||
endblanks = match.group('endparas')
|
||||
content = match.group('content')
|
||||
top_margin = ''
|
||||
bottom_margin = ''
|
||||
if initblanks is not None:
|
||||
top_margin = 'margin-top:'+str(len(self.single_blank.findall(initblanks)))+'em;'
|
||||
if endblanks is not None:
|
||||
bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(initblanks)))+'em;'
|
||||
bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(endblanks)))+'em;'
|
||||
|
||||
if initblanks == None and endblanks == None:
|
||||
return heading
|
||||
return content
|
||||
elif content.find('scenebreak') != -1:
|
||||
return content
|
||||
else:
|
||||
heading = re.sub('(?i)<h(?P<hnum>\d+)[^>]*>', '\n\n<h'+'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', heading)
|
||||
return heading
|
||||
content = re.sub('(?i)<h(?P<hnum>\d+)[^>]*>', '\n\n<h'+'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', content)
|
||||
return content
|
||||
|
||||
html = blanks_around_headings.sub(merge_header_whitespace, html)
|
||||
html = blanks_around_scene_breaks.sub(merge_header_whitespace, html)
|
||||
|
||||
def markup_whitespaces(match):
|
||||
blanks = match.group(0)
|
||||
@ -515,6 +525,12 @@ class HeuristicProcessor(object):
|
||||
html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
|
||||
return html
|
||||
|
||||
def detect_scene_breaks(self, html):
|
||||
scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
|
||||
scene_breaks = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
|
||||
html = scene_breaks.sub(self.scene_break_open+'\g<break>'+'</p>', html)
|
||||
return html
|
||||
|
||||
def markup_user_break(self, replacement_break):
|
||||
'''
|
||||
Takes string a user supplies and wraps it in markup that will be centered with
|
||||
@ -781,25 +797,25 @@ class HeuristicProcessor(object):
|
||||
if getattr(self.extra_opts, 'format_scene_breaks', False):
|
||||
self.log.debug('Formatting scene breaks')
|
||||
html = re.sub('(?i)<div[^>]*>\s*<br(\s?/)?>\s*</div>', '<p></p>', html)
|
||||
html = self.detect_scene_breaks(html)
|
||||
html = self.detect_whitespace(html)
|
||||
html = self.detect_soft_breaks(html)
|
||||
blanks_count = len(self.any_multi_blank.findall(html))
|
||||
if blanks_count >= 1:
|
||||
html = self.merge_blanks(html, blanks_count)
|
||||
scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
|
||||
scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
|
||||
detected_scene_break = re.compile(r'<p class="scenebreak"[^>]*>.*?</p>')
|
||||
scene_break_count = len(detected_scene_break.findall(html))
|
||||
# If the user has enabled scene break replacement, then either softbreaks
|
||||
# or 'hard' scene breaks are replaced, depending on which is in use
|
||||
# Otherwise separator lines are centered, use a bit larger margin in this case
|
||||
replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
|
||||
if replacement_break:
|
||||
replacement_break = self.markup_user_break(replacement_break)
|
||||
if len(scene_break.findall(html)) >= 1:
|
||||
html = scene_break.sub(replacement_break, html)
|
||||
if scene_break_count >= 1:
|
||||
html = detected_scene_break.sub(replacement_break, html)
|
||||
html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
|
||||
else:
|
||||
html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
|
||||
else:
|
||||
html = scene_break.sub(self.scene_break_open+'\g<break>'+'</p>', html)
|
||||
|
||||
if self.deleted_nbsps:
|
||||
# put back non-breaking spaces in empty paragraphs so they render correctly
|
||||
|
@ -18,6 +18,7 @@ from lxml import etree
from calibre import prepare_string_for_xml
from calibre.constants import __appname__, __version__
from calibre.utils.magick import Image
from calibre.utils.localization import lang_as_iso639_1

class FB2MLizer(object):
    '''
@ -103,7 +104,10 @@ class FB2MLizer(object):
        metadata['version'] = __version__
        metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year)
        if self.oeb_book.metadata.language:
            metadata['lang'] = self.oeb_book.metadata.language[0].value
            lc = lang_as_iso639_1(self.oeb_book.metadata.language[0].value)
            if not lc:
                lc = self.oeb_book.metadata.language[0].value
            metadata['lang'] = lc or 'en'
        else:
            metadata['lang'] = u'en'
        metadata['id'] = None
@ -197,14 +197,18 @@ class OverDrive(Source):
|
||||
title_tokens = list(self.get_title_tokens(title,
|
||||
strip_joiners=False, strip_subtitle=True))
|
||||
|
||||
if len(title_tokens) >= len(author_tokens):
|
||||
xref_q = ''
|
||||
if len(author_tokens) <= 1:
|
||||
initial_q = ' '.join(title_tokens)
|
||||
xref_q = '+'.join(author_tokens)
|
||||
else:
|
||||
initial_q = ' '.join(author_tokens)
|
||||
xref_q = '+'.join(title_tokens)
|
||||
#log.error('Initial query is %s'%initial_q)
|
||||
#log.error('Cross reference query is %s'%xref_q)
|
||||
for token in title_tokens:
|
||||
if len(xref_q) < len(token):
|
||||
xref_q = token
|
||||
|
||||
log.error('Initial query is %s'%initial_q)
|
||||
log.error('Cross reference query is %s'%xref_q)
|
||||
|
||||
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
|
||||
query = '{"szKeyword":"'+initial_q+'"}'
|
||||
@ -219,27 +223,30 @@ class OverDrive(Source):
|
||||
|
||||
# get the search results object
|
||||
results = False
|
||||
iterations = 0
|
||||
while results == False:
|
||||
iterations += 1
|
||||
xreq = mechanize.Request(q_xref)
|
||||
xreq.add_header('X-Requested-With', 'XMLHttpRequest')
|
||||
xreq.add_header('Referer', q_init_search)
|
||||
xreq.add_header('Accept', 'application/json, text/javascript, */*')
|
||||
raw = br.open_novisit(xreq).read()
|
||||
for m in re.finditer(ur'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)', raw):
|
||||
if int(m.group('displayrecords')) >= 1:
|
||||
results = True
|
||||
elif int(m.group('totalrecords')) >= 1:
|
||||
if int(m.group('totalrecords')) >= 100:
|
||||
if xref_q.find('+') != -1:
|
||||
xref_tokens = xref_q.split('+')
|
||||
xref_q = xref_tokens[0]
|
||||
#log.error('xref_q is '+xref_q)
|
||||
else:
|
||||
xref_q = ''
|
||||
xref_q = ''
|
||||
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
|
||||
elif int(m.group('totalrecords')) == 0:
|
||||
if int(m.group('totalrecords')) == 0:
|
||||
return ''
|
||||
elif int(m.group('displayrecords')) >= 1:
|
||||
results = True
|
||||
elif int(m.group('totalrecords')) >= 1 and iterations < 3:
|
||||
if xref_q.find('+') != -1:
|
||||
xref_tokens = xref_q.split('+')
|
||||
xref_q = xref_tokens[0]
|
||||
for token in xref_tokens:
|
||||
if len(xref_q) < len(token):
|
||||
xref_q = token
|
||||
#log.error('rewrote xref_q, new query is '+xref_q)
|
||||
else:
|
||||
xref_q = ''
|
||||
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
|
||||
|
||||
return self.sort_ovrdrv_results(raw, log, title, title_tokens, author, author_tokens)
|
||||
|
||||
@ -263,6 +270,7 @@ class OverDrive(Source):
|
||||
else:
|
||||
if creators:
|
||||
creators = creators.split(', ')
|
||||
|
||||
# if an exact match in a preferred format occurs
|
||||
if ((author and creators and creators[0] == author[0]) or (not author and not creators)) and od_title.lower() == title.lower() and int(formatid) in [1, 50, 410, 900] and thumbimage:
|
||||
return self.format_results(reserveid, od_title, subtitle, series, publisher,
|
||||
@ -330,9 +338,9 @@ class OverDrive(Source):
|
||||
def find_ovrdrv_data(self, br, log, title, author, isbn, ovrdrv_id=None):
|
||||
q = base_url
|
||||
if ovrdrv_id is None:
|
||||
return self.overdrive_search(br, log, q, title, author)
|
||||
return self.overdrive_search(br, log, q, title, author)
|
||||
else:
|
||||
return self.overdrive_get_record(br, log, q, ovrdrv_id)
|
||||
return self.overdrive_get_record(br, log, q, ovrdrv_id)
|
||||
|
||||
|
||||
|
||||
@ -461,10 +469,10 @@ if __name__ == '__main__':
|
||||
[
|
||||
|
||||
(
|
||||
{'title':'Foundation and Earth',
|
||||
'authors':['Asimov']},
|
||||
[title_test('Foundation and Earth', exact=True),
|
||||
authors_test(['Isaac Asimov'])]
|
||||
{'title':'The Sea Kings Daughter',
|
||||
'authors':['Elizabeth Peters']},
|
||||
[title_test('The Sea Kings Daughter', exact=False),
|
||||
authors_test(['Elizabeth Peters'])]
|
||||
),
|
||||
|
||||
(
|
||||
|
@ -48,7 +48,8 @@ def merge_result(oldmi, newmi, ensure_fields=None):
|
||||
|
||||
return newmi
|
||||
|
||||
def main(do_identify, covers, metadata, ensure_fields):
|
||||
def main(do_identify, covers, metadata, ensure_fields, tdir):
|
||||
os.chdir(tdir)
|
||||
failed_ids = set()
|
||||
failed_covers = set()
|
||||
all_failed = True
|
||||
@ -103,7 +104,8 @@ def single_identify(title, authors, identifiers):
|
||||
return [metadata_to_opf(r) for r in results], [r.has_cached_cover_url for
|
||||
r in results], dump_caches(), log.dump()
|
||||
|
||||
def single_covers(title, authors, identifiers, caches):
|
||||
def single_covers(title, authors, identifiers, caches, tdir):
|
||||
os.chdir(tdir)
|
||||
load_caches(caches)
|
||||
log = GUILog()
|
||||
results = Queue()
|
||||
|
@ -295,21 +295,21 @@ class MOBIHeader(object): # {{{
|
||||
self.datp_record_count, = struct.unpack(b'>I', self.raw[124:128])
|
||||
self.exth_flags, = struct.unpack(b'>I', self.raw[128:132])
|
||||
self.has_exth = bool(self.exth_flags & 0x40)
|
||||
self.has_drm_data = self.length >= 174 and len(self.raw) >= 180
|
||||
self.has_drm_data = self.length >= 174 and len(self.raw) >= 184
|
||||
if self.has_drm_data:
|
||||
self.unknown3 = self.raw[132:164]
|
||||
self.drm_offset, = struct.unpack(b'>I', self.raw[164:168])
|
||||
self.drm_count, = struct.unpack(b'>I', self.raw[168:172])
|
||||
self.drm_size, = struct.unpack(b'>I', self.raw[172:176])
|
||||
self.drm_flags = bin(struct.unpack(b'>I', self.raw[176:180])[0])
|
||||
self.unknown3 = self.raw[132:168]
|
||||
self.drm_offset, self.drm_count, self.drm_size, self.drm_flags = \
|
||||
struct.unpack(b'>4I', self.raw[168:184])
|
||||
self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16
|
||||
self.has_fcis_flis = False
|
||||
self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False
|
||||
self.extra_data_flags = 0
|
||||
if self.has_extra_data_flags:
|
||||
self.unknown4 = self.raw[180:192]
|
||||
self.fdst_idx, self.fdst_count = struct.unpack_from(b'>II',
|
||||
self.unknown4 = self.raw[184:192]
|
||||
self.fdst_idx, self.fdst_count = struct.unpack_from(b'>LL',
|
||||
self.raw, 192)
|
||||
if self.fdst_count <= 1:
|
||||
self.fdst_idx = NULL_INDEX
|
||||
(self.fcis_number, self.fcis_count, self.flis_number,
|
||||
self.flis_count) = struct.unpack(b'>IIII',
|
||||
self.raw[200:216])
|
||||
@ -327,7 +327,7 @@ class MOBIHeader(object): # {{{
|
||||
self.primary_index_record, = struct.unpack(b'>I',
|
||||
self.raw[244:248])
|
||||
|
||||
if self.file_version >= 8:
|
||||
if self.length >= 248:
|
||||
(self.sect_idx, self.skel_idx, self.datp_idx, self.oth_idx
|
||||
) = struct.unpack_from(b'>4L', self.raw, 248)
|
||||
self.unknown9 = self.raw[264:self.length]
|
||||
@ -337,12 +337,13 @@ class MOBIHeader(object): # {{{
|
||||
|
||||
# The following are all relative to the position of the header record
|
||||
# make them absolute for ease of debugging
|
||||
for x in ('sect_idx', 'skel_idx', 'datp_idx', 'oth_idx',
|
||||
self.relative_records = {'sect_idx', 'skel_idx', 'datp_idx', 'oth_idx',
|
||||
'meta_orth_indx', 'huffman_record_offset',
|
||||
'first_non_book_record', 'datp_record_offset', 'fcis_number',
|
||||
'flis_number', 'primary_index_record', 'fdst_idx',
|
||||
'first_image_index'):
|
||||
if hasattr(self, x):
|
||||
'first_image_index'}
|
||||
for x in self.relative_records:
|
||||
if hasattr(self, x) and getattr(self, x) != NULL_INDEX:
|
||||
setattr(self, x, self.header_offset+getattr(self, x))
|
||||
|
||||
if self.has_exth:
|
||||
@ -355,70 +356,79 @@ class MOBIHeader(object): # {{{
|
||||
|
||||
def __str__(self):
|
||||
ans = ['*'*20 + ' MOBI %d Header '%self.file_version+ '*'*20]
|
||||
|
||||
a = ans.append
|
||||
i = lambda d, x : a('%s (null value: %d): %d'%(d, NULL_INDEX, x))
|
||||
ans.append('Compression: %s'%self.compression)
|
||||
ans.append('Unused: %r'%self.unused)
|
||||
ans.append('Number of text records: %d'%self.number_of_text_records)
|
||||
ans.append('Text record size: %d'%self.text_record_size)
|
||||
ans.append('Encryption: %s'%self.encryption_type)
|
||||
ans.append('Unknown: %r'%self.unknown)
|
||||
ans.append('Identifier: %r'%self.identifier)
|
||||
ans.append('Header length: %d'% self.length)
|
||||
ans.append('Type: %s'%self.type)
|
||||
ans.append('Encoding: %s'%self.encoding)
|
||||
ans.append('UID: %r'%self.uid)
|
||||
ans.append('File version: %d'%self.file_version)
|
||||
i('Meta Orth Index (Sections index in KF8)', self.meta_orth_indx)
|
||||
i('Meta Infl Index', self.meta_infl_indx)
|
||||
ans.append('Secondary index record: %d (null val: %d)'%(
|
||||
self.secondary_index_record, NULL_INDEX))
|
||||
ans.append('Reserved: %r'%self.reserved)
|
||||
ans.append('First non-book record (null value: %d): %d'%(NULL_INDEX,
|
||||
self.first_non_book_record))
|
||||
ans.append('Full name offset: %d'%self.fullname_offset)
|
||||
ans.append('Full name length: %d bytes'%self.fullname_length)
|
||||
ans.append('Langcode: %r'%self.locale_raw)
|
||||
ans.append('Language: %s'%self.language)
|
||||
ans.append('Sub language: %s'%self.sublanguage)
|
||||
ans.append('Input language: %r'%self.input_language)
|
||||
ans.append('Output language: %r'%self.output_langauage)
|
||||
ans.append('Min version: %d'%self.min_version)
|
||||
ans.append('First Image index: %d'%self.first_image_index)
|
||||
ans.append('Huffman record offset: %d'%self.huffman_record_offset)
|
||||
ans.append('Huffman record count: %d'%self.huffman_record_count)
|
||||
ans.append('DATP record offset: %r'%self.datp_record_offset)
|
||||
ans.append('DATP record count: %r'%self.datp_record_count)
|
||||
ans.append('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth))
|
||||
|
||||
def i(d, x):
|
||||
x = 'NULL' if x == NULL_INDEX else x
|
||||
a('%s: %s'%(d, x))
|
||||
|
||||
def r(d, attr):
|
||||
x = getattr(self, attr)
|
||||
if attr in self.relative_records and x != NULL_INDEX:
|
||||
a('%s: Absolute: %d Relative: %d'%(d, x, x-self.header_offset))
|
||||
else:
|
||||
i(d, x)
|
||||
|
||||
a('Compression: %s'%self.compression)
|
||||
a('Unused: %r'%self.unused)
|
||||
a('Number of text records: %d'%self.number_of_text_records)
|
||||
a('Text record size: %d'%self.text_record_size)
|
||||
a('Encryption: %s'%self.encryption_type)
|
||||
a('Unknown: %r'%self.unknown)
|
||||
a('Identifier: %r'%self.identifier)
|
||||
a('Header length: %d'% self.length)
|
||||
a('Type: %s'%self.type)
|
||||
a('Encoding: %s'%self.encoding)
|
||||
a('UID: %r'%self.uid)
|
||||
a('File version: %d'%self.file_version)
|
||||
r('Meta Orth Index', 'meta_orth_indx')
|
||||
r('Meta Infl Index', 'meta_infl_indx')
|
||||
r('Secondary index record', 'secondary_index_record')
|
||||
a('Reserved: %r'%self.reserved)
|
||||
r('First non-book record', 'first_non_book_record')
|
||||
a('Full name offset: %d'%self.fullname_offset)
|
||||
a('Full name length: %d bytes'%self.fullname_length)
|
||||
a('Langcode: %r'%self.locale_raw)
|
||||
a('Language: %s'%self.language)
|
||||
a('Sub language: %s'%self.sublanguage)
|
||||
a('Input language: %r'%self.input_language)
|
||||
a('Output language: %r'%self.output_langauage)
|
||||
a('Min version: %d'%self.min_version)
|
||||
r('First Image index', 'first_image_index')
|
||||
r('Huffman record offset', 'huffman_record_offset')
|
||||
a('Huffman record count: %d'%self.huffman_record_count)
|
||||
r('DATP record offset', 'datp_record_offset')
|
||||
a('DATP record count: %r'%self.datp_record_count)
|
||||
a('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth))
|
||||
if self.has_drm_data:
|
||||
ans.append('Unknown3: %r'%self.unknown3)
|
||||
ans.append('DRM Offset: %s'%self.drm_offset)
|
||||
ans.append('DRM Count: %s'%self.drm_count)
|
||||
ans.append('DRM Size: %s'%self.drm_size)
|
||||
ans.append('DRM Flags: %r'%self.drm_flags)
|
||||
a('Unknown3: %r'%self.unknown3)
|
||||
r('DRM Offset', 'drm_offset')
|
||||
a('DRM Count: %s'%self.drm_count)
|
||||
a('DRM Size: %s'%self.drm_size)
|
||||
a('DRM Flags: %r'%self.drm_flags)
|
||||
if self.has_extra_data_flags:
|
||||
ans.append('Unknown4: %r'%self.unknown4)
|
||||
ans.append('FDST Index: %d'% self.fdst_idx)
|
||||
ans.append('FDST Count: %d'% self.fdst_count)
|
||||
ans.append('FCIS number: %d'% self.fcis_number)
|
||||
ans.append('FCIS count: %d'% self.fcis_count)
|
||||
ans.append('FLIS number: %d'% self.flis_number)
|
||||
ans.append('FLIS count: %d'% self.flis_count)
|
||||
ans.append('Unknown6: %r'% self.unknown6)
|
||||
ans.append('SRCS record index: %d'%self.srcs_record_index)
|
||||
ans.append('Number of SRCS records?: %d'%self.num_srcs_records)
|
||||
ans.append('Unknown7: %r'%self.unknown7)
|
||||
ans.append(('Extra data flags: %s (has multibyte: %s) '
|
||||
a('Unknown4: %r'%self.unknown4)
|
||||
r('FDST Index', 'fdst_idx')
|
||||
a('FDST Count: %d'% self.fdst_count)
|
||||
r('FCIS number', 'fcis_number')
|
||||
a('FCIS count: %d'% self.fcis_count)
|
||||
r('FLIS number', 'flis_number')
|
||||
a('FLIS count: %d'% self.flis_count)
|
||||
a('Unknown6: %r'% self.unknown6)
|
||||
r('SRCS record index', 'srcs_record_index')
|
||||
a('Number of SRCS records?: %d'%self.num_srcs_records)
|
||||
a('Unknown7: %r'%self.unknown7)
|
||||
a(('Extra data flags: %s (has multibyte: %s) '
|
||||
'(has indexing: %s) (has uncrossable breaks: %s)')%(
|
||||
bin(self.extra_data_flags), self.has_multibytes,
|
||||
self.has_indexing_bytes, self.has_uncrossable_breaks ))
|
||||
ans.append('Primary index record (null value: %d): %d'%(NULL_INDEX,
|
||||
self.primary_index_record))
|
||||
if self.file_version >= 8:
|
||||
i('Sections Index', self.sect_idx)
|
||||
i('SKEL Index', self.skel_idx)
|
||||
i('DATP Index', self.datp_idx)
|
||||
i('Other Index', self.oth_idx)
|
||||
r('NCX index', 'primary_index_record')
|
||||
if self.length >= 248:
|
||||
r('Sections Index', 'sect_idx')
|
||||
r('SKEL Index', 'skel_idx')
|
||||
r('DATP Index', 'datp_idx')
|
||||
r('Other Index', 'oth_idx')
|
||||
if self.unknown9:
|
||||
a('Unknown9: %r'%self.unknown9)
|
||||
|
||||
|
src/calibre/ebooks/mobi/debug/index.py (new file, 185 lines)
@ -0,0 +1,185 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from collections import OrderedDict, namedtuple
|
||||
|
||||
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
|
||||
from calibre.ebooks.mobi.reader.index import (CNCX, parse_indx_header,
|
||||
parse_tagx_section, parse_index_record, INDEX_HEADER_FIELDS)
|
||||
from calibre.ebooks.mobi.reader.ncx import (tag_fieldname_map, default_entry)
|
||||
|
||||
File = namedtuple('File',
|
||||
'file_number name divtbl_count start_position length')
|
||||
|
||||
Elem = namedtuple('Chunk',
|
||||
'insert_pos toc_text file_number sequence_number start_pos '
|
||||
'length')
|
||||
|
||||
GuideRef = namedtuple('GuideRef', 'type title pos_fid')
|
||||
|
||||
def read_index(sections, idx, codec):
|
||||
table, cncx = OrderedDict(), CNCX([], codec)
|
||||
|
||||
data = sections[idx].raw
|
||||
|
||||
indx_header = parse_indx_header(data)
|
||||
indx_count = indx_header['count']
|
||||
|
||||
if indx_header['ncncx'] > 0:
|
||||
off = idx + indx_count + 1
|
||||
cncx_records = [x.raw for x in sections[off:off+indx_header['ncncx']]]
|
||||
cncx = CNCX(cncx_records, codec)
|
||||
|
||||
tag_section_start = indx_header['tagx']
|
||||
control_byte_count, tags = parse_tagx_section(data[tag_section_start:])
|
||||
|
||||
for i in xrange(idx + 1, idx + 1 + indx_count):
|
||||
# Index record
|
||||
data = sections[i].raw
|
||||
parse_index_record(table, data, control_byte_count, tags, codec,
|
||||
indx_header['ordt_map'], strict=True)
|
||||
return table, cncx, indx_header
|
||||
|
||||
class Index(object):
|
||||
|
||||
def __init__(self, idx, records, codec):
|
||||
self.table = self.cncx = self.header = self.records = None
|
||||
if idx != NULL_INDEX:
|
||||
self.table, self.cncx, self.header = read_index(records, idx, codec)
|
||||
|
||||
def render(self):
|
||||
ans = ['*'*10 + ' Index Header ' + '*'*10]
|
||||
a = ans.append
|
||||
if self.header is not None:
|
||||
for field in INDEX_HEADER_FIELDS:
|
||||
a('%-12s: %r'%(field, self.header[field]))
|
||||
ans.extend(['', ''])
|
||||
|
||||
if self.cncx:
|
||||
a('*'*10 + ' CNCX ' + '*'*10)
|
||||
for offset, val in self.cncx.iteritems():
|
||||
a('%10s: %s'%(offset, val))
|
||||
ans.extend(['', ''])
|
||||
|
||||
if self.table is not None:
|
||||
a('*'*10 + ' %d Index Entries '%len(self.table) + '*'*10)
|
||||
for k, v in self.table.iteritems():
|
||||
a('%s: %r'%(k, v))
|
||||
|
||||
if self.records:
|
||||
ans.extend(['', '', '*'*10 + ' Parsed Entries ' + '*'*10])
|
||||
for f in self.records:
|
||||
a(repr(f))
|
||||
|
||||
return ans + ['']
|
||||
|
||||
def __str__(self):
|
||||
return '\n'.join(self.render())
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.records)
|
||||
|
||||
class SKELIndex(Index):
|
||||
|
||||
def __init__(self, skelidx, records, codec):
|
||||
super(SKELIndex, self).__init__(skelidx, records, codec)
|
||||
self.records = []
|
||||
|
||||
if self.table is not None:
|
||||
for i, text in enumerate(self.table.iterkeys()):
|
||||
tag_map = self.table[text]
|
||||
if set(tag_map.iterkeys()) != {1, 6}:
|
||||
raise ValueError('SKEL Index has unknown tags: %s'%
|
||||
(set(tag_map.iterkeys())-{1,6}))
|
||||
self.records.append(File(
|
||||
i, # file_number
|
||||
text, # name
|
||||
tag_map[1][0], # divtbl_count
|
||||
tag_map[6][0], # start_pos
|
||||
tag_map[6][1]) # length
|
||||
)
|
||||
|
||||
class SECTIndex(Index):
|
||||
|
||||
def __init__(self, sectidx, records, codec):
|
||||
super(SECTIndex, self).__init__(sectidx, records, codec)
|
||||
self.records = []
|
||||
|
||||
if self.table is not None:
|
||||
for i, text in enumerate(self.table.iterkeys()):
|
||||
tag_map = self.table[text]
|
||||
if set(tag_map.iterkeys()) != {2, 3, 4, 6}:
|
||||
raise ValueError('Chunk Index has unknown tags: %s'%
|
||||
(set(tag_map.iterkeys())-{2, 3, 4, 6}))
|
||||
|
||||
toc_text = self.cncx[tag_map[2][0]]
|
||||
self.records.append(Elem(
|
||||
int(text), # insert_pos
|
||||
toc_text, # toc_text
|
||||
tag_map[3][0], # file_number
|
||||
tag_map[4][0], # sequence_number
|
||||
tag_map[6][0], # start_pos
|
||||
tag_map[6][1] # length
|
||||
)
|
||||
)
|
||||
|
||||
class GuideIndex(Index):
|
||||
|
||||
def __init__(self, guideidx, records, codec):
|
||||
super(GuideIndex, self).__init__(guideidx, records, codec)
|
||||
self.records = []
|
||||
|
||||
if self.table is not None:
|
||||
for i, text in enumerate(self.table.iterkeys()):
|
||||
tag_map = self.table[text]
|
||||
if set(tag_map.iterkeys()) not in ({1, 6}, {1, 2, 3}):
|
||||
raise ValueError('Guide Index has unknown tags: %s'%
|
||||
tag_map)
|
||||
|
||||
title = self.cncx[tag_map[1][0]]
|
||||
self.records.append(GuideRef(
|
||||
text,
|
||||
title,
|
||||
tag_map[6] if 6 in tag_map else (tag_map[2], tag_map[3])
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
class NCXIndex(Index):
|
||||
|
||||
def __init__(self, ncxidx, records, codec):
|
||||
super(NCXIndex, self).__init__(ncxidx, records, codec)
|
||||
self.records = []
|
||||
|
||||
if self.table is not None:
|
||||
for num, x in enumerate(self.table.iteritems()):
|
||||
text, tag_map = x
|
||||
entry = default_entry.copy()
|
||||
entry['name'] = text
|
||||
entry['num'] = num
|
||||
|
||||
for tag in tag_fieldname_map.iterkeys():
|
||||
fieldname, i = tag_fieldname_map[tag]
|
||||
if tag in tag_map:
|
||||
fieldvalue = tag_map[tag][i]
|
||||
if tag == 6:
|
||||
# Appears to be an idx into the KF8 elems table with an
|
||||
# offset
|
||||
fieldvalue = tuple(tag_map[tag])
|
||||
entry[fieldname] = fieldvalue
|
||||
for which, name in {3:'text', 5:'kind', 70:'description',
|
||||
71:'author', 72:'image_caption',
|
||||
73:'image_attribution'}.iteritems():
|
||||
if tag == which:
|
||||
entry[name] = self.cncx.get(fieldvalue,
|
||||
default_entry[name])
|
||||
self.records.append(entry)
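A hedged usage sketch of the debug index wrappers defined above; `header` and `records` stand for an already-parsed MOBI8 header and the list of PDB section records (hypothetical names here, mirroring how mobi8.py constructs these below):

ncx = NCXIndex(header.primary_index_record, records, header.encoding)
print(str(ncx))               # index header fields, CNCX strings, parsed entries
for entry in ncx:             # __iter__ walks the parsed entry dicts
    print(entry['name'], entry['pos'])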
|
||||
|
||||
|
||||
|
@ -10,8 +10,11 @@ __docformat__ = 'restructuredtext en'
|
||||
import sys, os, imghdr, struct
|
||||
from itertools import izip
|
||||
|
||||
from calibre import CurrentDir
|
||||
from calibre.ebooks.mobi.debug.headers import TextRecord
|
||||
from calibre.ebooks.mobi.utils import read_font_record
|
||||
from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex, NCXIndex,
|
||||
GuideIndex)
|
||||
from calibre.ebooks.mobi.utils import read_font_record, decode_tbs
|
||||
from calibre.ebooks.mobi.debug import format_bytes
|
||||
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
|
||||
|
||||
@ -42,6 +45,24 @@ class FDST(object):
|
||||
|
||||
return '\n'.join(ans)
|
||||
|
||||
class File(object):
|
||||
|
||||
def __init__(self, skel, skeleton, text, first_aid, sections):
|
||||
self.name = 'part%04d'%skel.file_number
|
||||
self.skeleton, self.text, self.first_aid = skeleton, text, first_aid
|
||||
self.sections = sections
|
||||
|
||||
def dump(self, ddir):
|
||||
with open(os.path.join(ddir, self.name + '.html'), 'wb') as f:
|
||||
f.write(self.text)
|
||||
base = os.path.join(ddir, self.name + '-parts')
|
||||
os.mkdir(base)
|
||||
with CurrentDir(base):
|
||||
with open('skeleton.html', 'wb') as f:
|
||||
f.write(self.skeleton)
|
||||
for i, text in enumerate(self.sections):
|
||||
with open('sect-%04d.html'%i, 'wb') as f:
|
||||
f.write(text)
|
||||
|
||||
class MOBIFile(object):
|
||||
|
||||
@ -65,6 +86,9 @@ class MOBIFile(object):
|
||||
self.header = self.mf.mobi8_header
|
||||
self.extract_resources()
|
||||
self.read_fdst()
|
||||
self.read_indices()
|
||||
self.build_files()
|
||||
self.read_tbs()
|
||||
|
||||
def print_header(self, f=sys.stdout):
|
||||
print (str(self.mf.palmdb).encode('utf-8'), file=f)
|
||||
@ -85,6 +109,45 @@ class MOBIFile(object):
|
||||
if self.fdst.num_sections != self.header.fdst_count:
|
||||
raise ValueError('KF8 Header contains invalid FDST count')
|
||||
|
||||
def read_indices(self):
|
||||
self.skel_index = SKELIndex(self.header.skel_idx, self.mf.records,
|
||||
self.header.encoding)
|
||||
self.sect_index = SECTIndex(self.header.sect_idx, self.mf.records,
|
||||
self.header.encoding)
|
||||
self.ncx_index = NCXIndex(self.header.primary_index_record,
|
||||
self.mf.records, self.header.encoding)
|
||||
self.guide_index = GuideIndex(self.header.oth_idx, self.mf.records,
|
||||
self.header.encoding)
|
||||
|
||||
def build_files(self):
|
||||
text = self.raw_text
|
||||
self.files = []
|
||||
for skel in self.skel_index.records:
|
||||
sects = [x for x in self.sect_index.records if x.file_number
|
||||
== skel.file_number]
|
||||
skeleton = text[skel.start_position:skel.start_position+skel.length]
|
||||
ftext = skeleton
|
||||
first_aid = sects[0].toc_text
|
||||
sections = []
|
||||
|
||||
for sect in sects:
|
||||
start_pos = skel.start_position + skel.length + sect.start_pos
|
||||
sect_text = text[start_pos:start_pos+sect.length]
|
||||
insert_pos = sect.insert_pos - skel.start_position
|
||||
ftext = ftext[:insert_pos] + sect_text + ftext[insert_pos:]
|
||||
sections.append(sect_text)
|
||||
|
||||
self.files.append(File(skel, skeleton, ftext, first_aid, sections))
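A toy illustration (hypothetical byte strings) of the splice performed in the loop above: each chunk's text is inserted into the growing file at insert_pos, measured relative to the start of its skeleton:

skeleton   = b'<html><head/><body></body></html>'
chunk      = b'<p>hello</p>'
insert_pos = skeleton.find(b'</body>')          # where the chunk belongs
assembled  = skeleton[:insert_pos] + chunk + skeleton[insert_pos:]
# -> b'<html><head/><body><p>hello</p></body></html>'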
|
||||
|
||||
def dump_flows(self, ddir):
|
||||
if self.fdst is None:
|
||||
raise ValueError('This MOBI file has no FDST record')
|
||||
for i, x in enumerate(self.fdst.sections):
|
||||
start, end = x
|
||||
raw = self.raw_text[start:end]
|
||||
with open(os.path.join(ddir, 'flow%04d.txt'%i), 'wb') as f:
|
||||
f.write(raw)
|
||||
|
||||
def extract_resources(self):
|
||||
self.resource_map = []
|
||||
known_types = {b'FLIS', b'FCIS', b'SRCS',
|
||||
@ -121,6 +184,54 @@ class MOBIFile(object):
|
||||
self.resource_map.append(('%s/%06d%s.%s'%(prefix, i, suffix, ext),
|
||||
payload))
|
||||
|
||||
def read_tbs(self):
|
||||
from calibre.ebooks.mobi.writer8.tbs import (Entry,
|
||||
collect_indexing_data)
|
||||
entry_map = []
|
||||
for index in self.ncx_index:
|
||||
enders = [e['pos'] for e in self.ncx_index if e['pos'] >
|
||||
index['pos'] and
|
||||
e['hlvl'] <= index['hlvl']]
|
||||
end = min(enders+[len(self.raw_text)])
|
||||
|
||||
entry_map.append(Entry(index=index['num'], title=index['text'],
|
||||
depth=index['hlvl'],
|
||||
parent=index['parent'] if index['parent'] > -1 else None,
|
||||
first_child=index['child1'] if index['child1'] > -1 else None,
|
||||
last_child=index['childn'] if index['childn'] > -1 else None,
|
||||
start=index['pos'], length=end-index['pos']))
|
||||
|
||||
indexing_data = collect_indexing_data(entry_map,
|
||||
len(self.text_records))
|
||||
self.indexing_data = []
|
||||
for i, data in enumerate(indexing_data):
|
||||
rec = self.text_records[i]
|
||||
tbs_bytes = rec.trailing_data.get('indexing', b'')
|
||||
desc = ['Record #%d'%i]
|
||||
for x in ('starts', 'completes', 'ends', 'spans'):
|
||||
points = ['\t%d at depth: %d'%(e.index, e.depth) for e in
|
||||
getattr(data, x)]
|
||||
if points:
|
||||
desc.append(x+':')
|
||||
desc.extend(points)
|
||||
desc.append('TBS Bytes: ' + format_bytes(tbs_bytes))
|
||||
flag_sz = 3
|
||||
sequences = []
|
||||
while tbs_bytes:
|
||||
try:
|
||||
val, extra, consumed = decode_tbs(tbs_bytes, flag_size=flag_sz)
|
||||
except:
|
||||
break
|
||||
flag_sz = 4
|
||||
tbs_bytes = tbs_bytes[consumed:]
|
||||
extra = {bin(k):v for k, v in extra.iteritems()}
|
||||
sequences.append((val, extra))
|
||||
for i, seq in enumerate(sequences):
|
||||
desc.append('Sequence #%d: %r %r'%(i, seq[0], seq[1]))
|
||||
if tbs_bytes:
|
||||
desc.append('Remaining bytes: %s'%format_bytes(tbs_bytes))
|
||||
desc.append('')
|
||||
self.indexing_data.append('\n'.join(desc))
|
||||
|
||||
def inspect_mobi(mobi_file, ddir):
|
||||
f = MOBIFile(mobi_file)
|
||||
@ -131,7 +242,8 @@ def inspect_mobi(mobi_file, ddir):
|
||||
with open(alltext, 'wb') as of:
|
||||
of.write(f.raw_text)
|
||||
|
||||
for x in ('text_records', 'images', 'fonts', 'binary'):
|
||||
for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows',
|
||||
'tbs'):
|
||||
os.mkdir(os.path.join(ddir, x))
|
||||
|
||||
for rec in f.text_records:
|
||||
@ -145,3 +257,24 @@ def inspect_mobi(mobi_file, ddir):
|
||||
with open(os.path.join(ddir, 'fdst.record'), 'wb') as fo:
|
||||
fo.write(str(f.fdst).encode('utf-8'))
|
||||
|
||||
with open(os.path.join(ddir, 'skel.record'), 'wb') as fo:
|
||||
fo.write(str(f.skel_index).encode('utf-8'))
|
||||
|
||||
with open(os.path.join(ddir, 'chunks.record'), 'wb') as fo:
|
||||
fo.write(str(f.sect_index).encode('utf-8'))
|
||||
|
||||
with open(os.path.join(ddir, 'ncx.record'), 'wb') as fo:
|
||||
fo.write(str(f.ncx_index).encode('utf-8'))
|
||||
|
||||
with open(os.path.join(ddir, 'guide.record'), 'wb') as fo:
|
||||
fo.write(str(f.guide_index).encode('utf-8'))
|
||||
|
||||
with open(os.path.join(ddir, 'tbs', 'all.txt'), 'wb') as fo:
|
||||
fo.write(('\n'.join(f.indexing_data)).encode('utf-8'))
|
||||
|
||||
for part in f.files:
|
||||
part.dump(os.path.join(ddir, 'files'))
|
||||
|
||||
f.dump_flows(os.path.join(ddir, 'flows'))
|
||||
|
||||
|
||||
|
@ -10,7 +10,7 @@ import copy
|
||||
import re
|
||||
from lxml import etree
|
||||
from calibre.ebooks.oeb.base import namespace, barename
|
||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, OEB_DOCS, urlnormalize
|
||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, urlnormalize
|
||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||
from calibre.ebooks.oeb.transforms.flatcss import KeyMapper
|
||||
from calibre.utils.magick.draw import identify_data
|
||||
@ -109,26 +109,8 @@ class MobiMLizer(object):
|
||||
self.profile = profile = context.dest
|
||||
self.fnums = fnums = dict((v, k) for k, v in profile.fnums.items())
|
||||
self.fmap = KeyMapper(profile.fbase, profile.fbase, fnums.keys())
|
||||
self.remove_html_cover()
|
||||
self.mobimlize_spine()
|
||||
|
||||
def remove_html_cover(self):
|
||||
oeb = self.oeb
|
||||
if not oeb.metadata.cover \
|
||||
or 'cover' not in oeb.guide:
|
||||
return
|
||||
href = oeb.guide['cover'].href
|
||||
del oeb.guide['cover']
|
||||
item = oeb.manifest.hrefs[href]
|
||||
if item.spine_position is not None:
|
||||
self.log.warn('Found an HTML cover,', item.href, 'removing it.',
|
||||
'If you find some content missing from the output MOBI, it '
|
||||
'is because you misidentified the HTML cover in the input '
|
||||
'document')
|
||||
oeb.spine.remove(item)
|
||||
if item.media_type in OEB_DOCS:
|
||||
self.oeb.manifest.remove(item)
|
||||
|
||||
def mobimlize_spine(self):
|
||||
'Iterate over the spine and convert it to MOBIML'
|
||||
for item in self.oeb.spine:
|
||||
@ -473,7 +455,7 @@ class MobiMLizer(object):
|
||||
if tag in TABLE_TAGS and self.ignore_tables:
|
||||
tag = 'span' if tag == 'td' else 'div'
|
||||
|
||||
if tag == 'table':
|
||||
if tag in ('table', 'td', 'tr'):
|
||||
col = style.backgroundColor
|
||||
if col:
|
||||
elem.set('bgcolor', col)
|
||||
|
@ -111,6 +111,13 @@ class CNCX(object): # {{{
|
||||
|
||||
def get(self, offset, default=None):
|
||||
return self.records.get(offset, default)
|
||||
|
||||
def __bool__(self):
|
||||
return bool(self.records)
|
||||
__nonzero__ = __bool__
|
||||
|
||||
def iteritems(self):
|
||||
return self.records.iteritems()
|
||||
# }}}
|
||||
|
||||
def parse_tagx_section(data):
|
||||
|
@ -223,15 +223,15 @@ def insert_images_into_markup(parts, resource_map, log):
|
||||
# Handle any embedded raster images links in the xhtml text
|
||||
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
|
||||
img_pattern = re.compile(r'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
|
||||
img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''')
|
||||
img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)[^')"]*[)'"]''')
|
||||
|
||||
style_pattern = re.compile(r'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''',
|
||||
re.IGNORECASE)
|
||||
|
||||
for i in xrange(len(parts)):
|
||||
part = parts[i]
|
||||
#[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
|
||||
|
||||
# links to raster image files
|
||||
# image_pattern
|
||||
srcpieces = img_pattern.split(part)
|
||||
for j in range(1, len(srcpieces), 2):
|
||||
for j in xrange(1, len(srcpieces), 2):
|
||||
tag = srcpieces[j]
|
||||
if tag.startswith('<im'):
|
||||
for m in img_index_pattern.finditer(tag):
|
||||
@ -248,6 +248,30 @@ def insert_images_into_markup(parts, resource_map, log):
|
||||
# store away modified version
|
||||
parts[i] = part
|
||||
|
||||
# Replace urls used in style attributes
|
||||
for i in xrange(len(parts)):
|
||||
part = parts[i]
|
||||
srcpieces = style_pattern.split(part)
|
||||
for j in xrange(1, len(srcpieces), 2):
|
||||
tag = srcpieces[j]
|
||||
if 'kindle:embed' in tag:
|
||||
for m in img_index_pattern.finditer(tag):
|
||||
num = int(m.group(1), 32)
|
||||
href = resource_map[num-1]
|
||||
osep = m.group()[0]
|
||||
csep = m.group()[-1]
|
||||
if href:
|
||||
replacement = '%s%s%s'%(osep, '../' + href, csep)
|
||||
tag = img_index_pattern.sub(replacement, tag, 1)
|
||||
else:
|
||||
log.warn('Referenced image %s was not recognized as '
|
||||
'a valid image in %s' % (num, tag))
|
||||
srcpieces[j] = tag
|
||||
part = "".join(srcpieces)
|
||||
# store away modified version
|
||||
parts[i] = part
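A worked example of the substitution above, with a hypothetical resource_map entry; the opening and closing separators around the kindle:embed URL are preserved and the base-32 index is rewritten to a relative href:

# Hypothetical input; resource_map[0] is assumed to be 'images/00001.jpeg'.
tag = '<div style="background: url(\'kindle:embed:0001?mime=image/jpeg\')">'
# int('0001', 32) == 1, so the match above is rewritten to:
# <div style="background: url('../images/00001.jpeg')">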
|
||||
|
||||
|
||||
def upshift_markup(parts):
|
||||
tag_pattern = re.compile(r'''(<(?:svg)[^>]*>)''', re.IGNORECASE)
|
||||
|
||||
|
@ -109,7 +109,7 @@ class Mobi8Reader(object):
|
||||
table, cncx = read_index(self.kf8_sections, self.header.othidx,
|
||||
self.header.codec)
|
||||
Item = namedtuple('Item',
|
||||
'type title div_frag_num')
|
||||
'type title pos_fid')
|
||||
|
||||
for i, ref_type in enumerate(table.iterkeys()):
|
||||
tag_map = table[ref_type]
|
||||
@ -119,7 +119,7 @@ class Mobi8Reader(object):
|
||||
if 3 in tag_map.keys():
|
||||
fileno = tag_map[3][0]
|
||||
if 6 in tag_map.keys():
|
||||
fileno = tag_map[6][0]
|
||||
fileno = tag_map[6]
|
||||
self.guide.append(Item(ref_type.decode(self.header.codec),
|
||||
title, fileno))
|
||||
|
||||
@ -287,23 +287,24 @@ class Mobi8Reader(object):
|
||||
|
||||
def create_guide(self):
|
||||
guide = Guide()
|
||||
for ref_type, ref_title, fileno in self.guide:
|
||||
has_start = False
|
||||
for ref_type, ref_title, pos_fid in self.guide:
|
||||
try:
|
||||
elem = self.elems[fileno]
|
||||
except IndexError:
|
||||
# Happens for thumbnailstandard in Amazon book samples
|
||||
continue
|
||||
fi = self.get_file_info(elem.insert_pos)
|
||||
idtext = self.get_id_tag(elem.insert_pos).decode(self.header.codec)
|
||||
linktgt = fi.filename
|
||||
if len(pos_fid) != 2:
|
||||
continue
|
||||
except TypeError:
|
||||
continue # thumbnailstandard record, ignore it
|
||||
linktgt, idtext = self.get_id_tag_by_pos_fid(*pos_fid)
|
||||
if idtext:
|
||||
linktgt += b'#' + idtext
|
||||
g = Guide.Reference('%s/%s'%(fi.type, linktgt), os.getcwdu())
|
||||
g = Guide.Reference(linktgt, os.getcwdu())
|
||||
g.title, g.type = ref_title, ref_type
|
||||
if g.title == 'start' or g.type == 'text':
|
||||
has_start = True
|
||||
guide.append(g)
|
||||
|
||||
so = self.header.exth.start_offset
|
||||
if so not in {None, NULL_INDEX}:
|
||||
if so not in {None, NULL_INDEX} and not has_start:
|
||||
fi = self.get_file_info(so)
|
||||
if fi.filename is not None:
|
||||
idtext = self.get_id_tag(so).decode(self.header.codec)
|
||||
|
@ -7,13 +7,15 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import struct, string, imghdr, zlib
|
||||
import struct, string, imghdr, zlib, os
|
||||
from collections import OrderedDict
|
||||
from io import BytesIO
|
||||
|
||||
from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
|
||||
from calibre.ebooks import normalize
|
||||
|
||||
IMAGE_MAX_SIZE = 10 * 1024 * 1024
|
||||
RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))
|
||||
|
||||
def decode_string(raw, codec='utf-8', ordt_map=''):
|
||||
length, = struct.unpack(b'>B', raw[0])
|
||||
@ -364,15 +366,17 @@ def count_set_bits(num):
|
||||
num >>= 1
|
||||
return ans
|
||||
|
||||
def to_base(num, base=32):
|
||||
def to_base(num, base=32, min_num_digits=None):
|
||||
digits = string.digits + string.ascii_uppercase
|
||||
sign = 1 if num >= 0 else -1
|
||||
if num == 0: return '0'
|
||||
if num == 0: return ('0' if min_num_digits is None else '0'*min_num_digits)
|
||||
num *= sign
|
||||
ans = []
|
||||
while num:
|
||||
ans.append(digits[(num % base)])
|
||||
num //= base
|
||||
if min_num_digits is not None and len(ans) < min_num_digits:
|
||||
ans.extend('0'*(min_num_digits - len(ans)))
|
||||
if sign < 0:
|
||||
ans.append('-')
|
||||
ans.reverse()
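Worked examples of the base-32 encoding used for kindle:embed indices (digits 0-9 then A-V); min_num_digits left-pads the result with zeros so it round-trips with int(..., 32) on the reader side:

# to_base(300)                    == '9C'      (300 == 9*32 + 12)
# to_base(300, min_num_digits=4)  == '009C'
# int('009C', 32)                 == 300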
|
||||
@ -388,27 +392,8 @@ def mobify_image(data):
|
||||
data = im.export('gif')
|
||||
return data
|
||||
|
||||
def read_zlib_header(header):
|
||||
header = bytearray(header)
|
||||
# See sec 2.2 of RFC 1950 for the zlib stream format
|
||||
# http://www.ietf.org/rfc/rfc1950.txt
|
||||
if (header[0]*256 + header[1])%31 != 0:
|
||||
return None, 'Bad zlib header, FCHECK failed'
|
||||
|
||||
cmf = header[0] & 0b1111
|
||||
cinfo = header[0] >> 4
|
||||
if cmf != 8:
|
||||
return None, 'Unknown zlib compression method: %d'%cmf
|
||||
if cinfo > 7:
|
||||
return None, 'Invalid CINFO field in zlib header: %d'%cinfo
|
||||
fdict = (header[1]&0b10000)>>5
|
||||
if fdict != 0:
|
||||
return None, 'FDICT based zlib compression not supported'
|
||||
wbits = cinfo + 8
|
||||
return wbits, None
|
||||
|
||||
|
||||
def read_font_record(data, extent=1040): # {{{
|
||||
# Font records {{{
|
||||
def read_font_record(data, extent=1040):
|
||||
'''
|
||||
Return the font encoded in the MOBI FONT record represented by data.
|
||||
The return value is a dict with fields raw_data, font_data, err, ext,
|
||||
@ -466,15 +451,8 @@ def read_font_record(data, extent=1040): # {{{
|
||||
|
||||
if flags & 0b1:
|
||||
# ZLIB compressed data
|
||||
wbits, err = read_zlib_header(font_data[:2])
|
||||
if err is not None:
|
||||
ans['err'] = err
|
||||
return ans
|
||||
adler32, = struct.unpack_from(b'>I', font_data, len(font_data) - 4)
|
||||
try:
|
||||
# remove two bytes of zlib header and 4 bytes of trailing checksum
|
||||
# negative wbits indicates no standard gzip header
|
||||
font_data = zlib.decompress(font_data[2:-4], -wbits, usize)
|
||||
font_data = zlib.decompress(font_data)
|
||||
except Exception as e:
|
||||
ans['err'] = 'Failed to zlib decompress font data (%s)'%e
|
||||
return ans
|
||||
@ -483,23 +461,146 @@ def read_font_record(data, extent=1040): # {{{
|
||||
ans['err'] = 'Uncompressed font size mismatch'
|
||||
return ans
|
||||
|
||||
if False:
|
||||
# For some reason these almost never match, probably Amazon has a
|
||||
# buggy Adler32 implementation
|
||||
sig = (zlib.adler32(font_data) & 0xffffffff)
|
||||
if sig != adler32:
|
||||
ans['err'] = ('Adler checksum did not match. Stored: %d '
|
||||
'Calculated: %d')%(adler32, sig)
|
||||
return ans
|
||||
|
||||
ans['font_data'] = font_data
|
||||
sig = font_data[:4]
|
||||
ans['ext'] = ('ttf' if sig in {b'\0\1\0\0', b'true', b'ttcf'}
|
||||
else 'otf' if sig == b'OTTO' else 'dat')
|
||||
|
||||
return ans
|
||||
|
||||
def write_font_record(data, obfuscate=True, compress=True):
|
||||
'''
|
||||
Write the ttf/otf font represented by data into a font record. See
|
||||
read_font_record() for details on the format of the record.
|
||||
'''
|
||||
|
||||
flags = 0
|
||||
key_len = 20
|
||||
usize = len(data)
|
||||
xor_key = b''
|
||||
if compress:
|
||||
flags |= 0b1
|
||||
data = zlib.compress(data, 9)
|
||||
if obfuscate:
|
||||
flags |= 0b10
|
||||
xor_key = os.urandom(key_len)
|
||||
key = bytearray(xor_key)
|
||||
data = bytearray(data)
|
||||
for i in xrange(1040):
|
||||
data[i] ^= key[i%key_len]
|
||||
data = bytes(data)
|
||||
|
||||
key_start = struct.calcsize(b'>5L') + 4
|
||||
data_start = key_start + len(xor_key)
|
||||
|
||||
header = b'FONT' + struct.pack(b'>5L', usize, flags, data_start,
|
||||
len(xor_key), key_start)
|
||||
|
||||
return header + xor_key + data
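A hedged usage sketch: this is the record that Resources.add_resources() (added below) appends for embedded fonts, and read_font_record() above is expected to reverse it:

# Hedged sketch; 'SomeFont.ttf' is a hypothetical path.
font = open('SomeFont.ttf', 'rb').read()
rec = write_font_record(font, obfuscate=True, compress=True)
ans = read_font_record(rec)
# expected: ans['ext'] == 'ttf' and ans['font_data'] == font when ans['err'] is None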
|
||||
|
||||
# }}}
|
||||
|
||||
def create_text_record(text):
|
||||
'''
|
||||
Return a Palmdoc record of size RECORD_SIZE from the text file object.
|
||||
In case the record ends in the middle of a multibyte character return
|
||||
the overlap as well.
|
||||
|
||||
Returns data, overlap: where both are byte strings. overlap is the
|
||||
extra bytes needed to complete the truncated multibyte character.
|
||||
'''
|
||||
opos = text.tell()
|
||||
text.seek(0, 2)
|
||||
# npos is the position of the next record
|
||||
npos = min((opos + RECORD_SIZE, text.tell()))
|
||||
# Number of bytes from the next record needed to complete the last
|
||||
# character in this record
|
||||
extra = 0
|
||||
|
||||
last = b''
|
||||
while not last.decode('utf-8', 'ignore'):
|
||||
# last contains no valid utf-8 characters
|
||||
size = len(last) + 1
|
||||
text.seek(npos - size)
|
||||
last = text.read(size)
|
||||
|
||||
# last now has one valid utf-8 char and possibly some bytes that belong
|
||||
# to a truncated char
|
||||
|
||||
try:
|
||||
last.decode('utf-8', 'strict')
|
||||
except UnicodeDecodeError:
|
||||
# There are some truncated bytes in last
|
||||
prev = len(last)
|
||||
while True:
|
||||
text.seek(npos - prev)
|
||||
last = text.read(len(last) + 1)
|
||||
try:
|
||||
last.decode('utf-8')
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
else:
|
||||
break
|
||||
extra = len(last) - prev
|
||||
|
||||
text.seek(opos)
|
||||
data = text.read(RECORD_SIZE)
|
||||
overlap = text.read(extra)
|
||||
text.seek(npos)
|
||||
|
||||
return data, overlap
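A minimal sketch of the record splitting, assuming calibre is importable; the overlap is the byte(s) needed to finish a UTF-8 character cut at the 4096-byte boundary:

from io import BytesIO
from calibre.ebooks.mobi.utils import create_text_record, RECORD_SIZE

text = BytesIO((u'\u20ac' * 3000).encode('utf-8'))   # 9000 bytes of 3-byte chars
data, overlap = create_text_record(text)
# len(data) == RECORD_SIZE == 4096; (data + overlap) decodes as valid UTF-8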
|
||||
|
||||
class CNCX(object): # {{{
|
||||
|
||||
'''
|
||||
Create the CNCX records. These are records containing all the strings from
|
||||
an index. Each record is of the form: <vwi string size><utf-8 encoded
|
||||
string>
|
||||
'''
|
||||
|
||||
MAX_STRING_LENGTH = 500
|
||||
|
||||
def __init__(self, strings=()):
|
||||
self.strings = OrderedDict((s, 0) for s in strings)
|
||||
|
||||
self.records = []
|
||||
offset = 0
|
||||
buf = BytesIO()
|
||||
for key in tuple(self.strings.iterkeys()):
|
||||
utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
|
||||
l = len(utf8)
|
||||
sz_bytes = encint(l)
|
||||
raw = sz_bytes + utf8
|
||||
if 0xfbf8 - buf.tell() < 6 + len(raw):
|
||||
# Records in PDB files cannot be larger than 0x10000, so we
|
||||
# stop well before that.
|
||||
pad = 0xfbf8 - buf.tell()
|
||||
buf.write(b'\0' * pad)
|
||||
self.records.append(buf.getvalue())
|
||||
buf.seek(0), buf.truncate(0)
|
||||
offset = len(self.records) * 0x10000
|
||||
buf.write(raw)
|
||||
self.strings[key] = offset
|
||||
offset += len(raw)
|
||||
|
||||
val = buf.getvalue()
|
||||
if val:
|
||||
self.records.append(align_block(val))
|
||||
|
||||
def __getitem__(self, string):
|
||||
return self.strings[string]
|
||||
|
||||
def __bool__(self):
|
||||
return bool(self.records)
|
||||
__nonzero__ = __bool__
|
||||
|
||||
def __len__(self):
|
||||
return len(self.records)
|
||||
|
||||
# }}}
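A hedged usage sketch of the shared CNCX builder above; the offsets returned by __getitem__ are what index entries store for their strings:

cncx = CNCX([u'Chapter One', u'Chapter Two'])
cncx[u'Chapter Two']    # byte offset of that string within the CNCX data (12 here)
len(cncx)               # number of CNCX records (1 for this little data)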
|
||||
|
||||
def is_guide_ref_start(ref):
|
||||
return (ref.title.lower() == 'start' or
|
||||
(ref.type and ref.type.lower() in {'start',
|
||||
'other.start', 'text'}))
|
||||
|
||||
|
@ -12,5 +12,4 @@ UNCOMPRESSED = 1
|
||||
PALMDOC = 2
|
||||
HUFFDIC = 17480
|
||||
PALM_MAX_IMAGE_SIZE = 63 * 1024
|
||||
RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))
|
||||
|
||||
|
@ -12,56 +12,22 @@ from struct import pack
|
||||
from cStringIO import StringIO
|
||||
from collections import OrderedDict, defaultdict
|
||||
|
||||
from calibre.ebooks.mobi.writer2 import RECORD_SIZE
|
||||
from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
|
||||
encode_tbs, align_block, utf8_text)
|
||||
encode_tbs, align_block, RECORD_SIZE, CNCX as CNCX_)
|
||||
|
||||
class CNCX(object): # {{{
|
||||
|
||||
'''
|
||||
Create the CNCX records. These are records containing all the strings from
|
||||
the NCX. Each record is of the form: <vwi string size><utf-8 encoded
|
||||
string>
|
||||
'''
|
||||
|
||||
MAX_STRING_LENGTH = 500
|
||||
class CNCX(CNCX_): # {{{
|
||||
|
||||
def __init__(self, toc, is_periodical):
|
||||
self.strings = OrderedDict()
|
||||
|
||||
strings = []
|
||||
for item in toc.iterdescendants(breadth_first=True):
|
||||
self.strings[item.title] = 0
|
||||
strings.append(item.title)
|
||||
if is_periodical:
|
||||
self.strings[item.klass] = 0
|
||||
strings.append(item.klass)
|
||||
if item.author:
|
||||
self.strings[item.author] = 0
|
||||
strings.append(item.author)
|
||||
if item.description:
|
||||
self.strings[item.description] = 0
|
||||
|
||||
self.records = []
|
||||
offset = 0
|
||||
buf = StringIO()
|
||||
for key in tuple(self.strings.iterkeys()):
|
||||
utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
|
||||
l = len(utf8)
|
||||
sz_bytes = encint(l)
|
||||
raw = sz_bytes + utf8
|
||||
if 0xfbf8 - buf.tell() < 6 + len(raw):
|
||||
# Records in PDB files cannot be larger than 0x10000, so we
|
||||
# stop well before that.
|
||||
pad = 0xfbf8 - buf.tell()
|
||||
buf.write(b'\0' * pad)
|
||||
self.records.append(buf.getvalue())
|
||||
buf.truncate(0)
|
||||
offset = len(self.records) * 0x10000
|
||||
buf.write(raw)
|
||||
self.strings[key] = offset
|
||||
offset += len(raw)
|
||||
|
||||
self.records.append(align_block(buf.getvalue()))
|
||||
|
||||
def __getitem__(self, string):
|
||||
return self.strings[string]
|
||||
strings.append(item.description)
|
||||
CNCX_.__init__(self, strings)
|
||||
# }}}
|
||||
|
||||
class TAGX(object): # {{{
|
||||
@ -534,14 +500,14 @@ class Indexer(object): # {{{
|
||||
|
||||
# Write offsets to index entries as an IDXT block
|
||||
idxt_block = b'IDXT'
|
||||
buf.truncate(0)
|
||||
buf.seek(0), buf.truncate(0)
|
||||
for offset in offsets:
|
||||
buf.write(pack(b'>H', header_length+offset))
|
||||
idxt_block = align_block(idxt_block + buf.getvalue())
|
||||
body = index_block + idxt_block
|
||||
|
||||
header = b'INDX'
|
||||
buf.truncate(0)
|
||||
buf.seek(0), buf.truncate(0)
|
||||
buf.write(pack(b'>I', header_length))
|
||||
buf.write(b'\0'*4) # Unknown
|
||||
buf.write(pack(b'>I', 1)) # Header type? Or index record number?
|
||||
|
@ -7,51 +7,31 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re, random, time
|
||||
import random, time
|
||||
from cStringIO import StringIO
|
||||
from struct import pack
|
||||
|
||||
from calibre.ebooks import normalize, generate_masthead
|
||||
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
|
||||
from calibre.ebooks import normalize
|
||||
from calibre.ebooks.mobi.writer2.serializer import Serializer
|
||||
from calibre.ebooks.compression.palmdoc import compress_doc
|
||||
from calibre.ebooks.mobi.langcodes import iana2mobi
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED, RECORD_SIZE)
|
||||
from calibre.ebooks.mobi.utils import (rescale_image, encint, mobify_image,
|
||||
encode_trailing_data, align_block, detect_periodical)
|
||||
from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED)
|
||||
from calibre.ebooks.mobi.utils import (encint, encode_trailing_data,
|
||||
align_block, detect_periodical, RECORD_SIZE, create_text_record)
|
||||
from calibre.ebooks.mobi.writer2.indexer import Indexer
|
||||
from calibre.ebooks.mobi import MAX_THUMB_DIMEN, MAX_THUMB_SIZE
|
||||
|
||||
EXTH_CODES = {
|
||||
'creator': 100,
|
||||
'publisher': 101,
|
||||
'description': 103,
|
||||
'identifier': 104,
|
||||
'subject': 105,
|
||||
'pubdate': 106,
|
||||
'review': 107,
|
||||
'contributor': 108,
|
||||
'rights': 109,
|
||||
'type': 111,
|
||||
'source': 112,
|
||||
'versionnumber': 114,
|
||||
'startreading': 116,
|
||||
'coveroffset': 201,
|
||||
'thumboffset': 202,
|
||||
'hasfakecover': 203,
|
||||
'lastupdatetime': 502,
|
||||
'title': 503,
|
||||
}
|
||||
|
||||
# Disabled as I don't care about uncrossable breaks
|
||||
WRITE_UNCROSSABLE_BREAKS = False
|
||||
NULL_INDEX = 0xffffffff
|
||||
|
||||
class MobiWriter(object):
|
||||
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
|
||||
|
||||
def __init__(self, opts, write_page_breaks_after_item=True):
|
||||
def __init__(self, opts, resources, kf8, write_page_breaks_after_item=True):
|
||||
self.opts = opts
|
||||
self.resources = resources
|
||||
self.kf8 = kf8
|
||||
self.for_joint = kf8 is not None
|
||||
self.write_page_breaks_after_item = write_page_breaks_after_item
|
||||
self.compression = UNCOMPRESSED if opts.dont_compress else PALMDOC
|
||||
self.prefer_author_sort = opts.prefer_author_sort
|
||||
@ -83,7 +63,7 @@ class MobiWriter(object):
|
||||
self.stream = stream
|
||||
self.records = [None]
|
||||
self.generate_content()
|
||||
self.generate_record0()
|
||||
self.generate_joint_record0() if self.for_joint else self.generate_record0()
|
||||
self.write_header()
|
||||
self.write_content()
|
||||
|
||||
@ -151,73 +131,19 @@ class MobiWriter(object):
|
||||
# Images {{{
|
||||
|
||||
def generate_images(self):
|
||||
oeb = self.oeb
|
||||
oeb.logger.info('Serializing images...')
|
||||
self.image_records = []
|
||||
self.image_map = {}
|
||||
self.masthead_offset = 0
|
||||
index = 1
|
||||
resources = self.resources
|
||||
image_records = resources.records
|
||||
self.image_map = resources.item_map
|
||||
self.masthead_offset = resources.masthead_offset
|
||||
self.cover_offset = resources.cover_offset
|
||||
self.thumbnail_offset = resources.thumbnail_offset
|
||||
|
||||
mh_href = None
|
||||
if 'masthead' in oeb.guide and oeb.guide['masthead'].href:
|
||||
mh_href = oeb.guide['masthead'].href
|
||||
self.image_records.append(None)
|
||||
index += 1
|
||||
elif self.is_periodical:
|
||||
# Generate a default masthead
|
||||
data = generate_masthead(unicode(self.oeb.metadata['title'][0]))
|
||||
self.image_records.append(data)
|
||||
index += 1
|
||||
|
||||
cover_href = self.cover_offset = self.thumbnail_offset = None
|
||||
if (oeb.metadata.cover and
|
||||
unicode(oeb.metadata.cover[0]) in oeb.manifest.ids):
|
||||
cover_id = unicode(oeb.metadata.cover[0])
|
||||
item = oeb.manifest.ids[cover_id]
|
||||
cover_href = item.href
|
||||
|
||||
for item in self.oeb.manifest.values():
|
||||
if item.media_type not in OEB_RASTER_IMAGES: continue
|
||||
try:
|
||||
data = item.data
|
||||
if self.opts.mobi_keep_original_images:
|
||||
data = mobify_image(data)
|
||||
else:
|
||||
data = rescale_image(data)
|
||||
except:
|
||||
oeb.logger.warn('Bad image file %r' % item.href)
|
||||
continue
|
||||
else:
|
||||
if mh_href and item.href == mh_href:
|
||||
self.image_records[0] = data
|
||||
continue
|
||||
|
||||
self.image_records.append(data)
|
||||
self.image_map[item.href] = index
|
||||
index += 1
|
||||
|
||||
if cover_href and item.href == cover_href:
|
||||
self.cover_offset = self.image_map[item.href] - 1
|
||||
try:
|
||||
data = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
|
||||
maxsizeb=MAX_THUMB_SIZE)
|
||||
except:
|
||||
oeb.logger.warn('Failed to generate thumbnail')
|
||||
else:
|
||||
self.image_records.append(data)
|
||||
self.thumbnail_offset = index - 1
|
||||
index += 1
|
||||
finally:
|
||||
item.unload_data_from_memory()
|
||||
|
||||
if self.image_records and self.image_records[0] is None:
|
||||
if image_records and image_records[0] is None:
|
||||
raise ValueError('Failed to find masthead image in manifest')
|
||||
|
||||
# }}}
|
||||
|
||||
# Text {{{
|
||||
|
||||
def generate_text(self):
|
||||
def generate_text(self): # {{{
|
||||
self.oeb.logger.info('Serializing markup content...')
|
||||
self.serializer = Serializer(self.oeb, self.image_map,
|
||||
self.is_periodical,
|
||||
@ -232,7 +158,7 @@ class MobiWriter(object):
|
||||
self.oeb.logger.info(' Compressing markup content...')
|
||||
|
||||
while text.tell() < self.text_length:
|
||||
data, overlap = self.read_text_record(text)
|
||||
data, overlap = create_text_record(text)
|
||||
if self.compression == PALMDOC:
|
||||
data = compress_doc(data)
|
||||
|
||||
@ -249,57 +175,6 @@ class MobiWriter(object):
|
||||
if records_size % 4 != 0:
|
||||
self.records.append(b'\x00'*(records_size % 4))
|
||||
self.first_non_text_record_idx += 1
|
||||
|
||||
def read_text_record(self, text):
|
||||
'''
|
||||
Return a Palmdoc record of size RECORD_SIZE from the text file object.
|
||||
In case the record ends in the middle of a multibyte character return
|
||||
the overlap as well.
|
||||
|
||||
Returns data, overlap: where both are byte strings. overlap is the
|
||||
extra bytes needed to complete the truncated multibyte character.
|
||||
'''
|
||||
opos = text.tell()
|
||||
text.seek(0, 2)
|
||||
# npos is the position of the next record
|
||||
npos = min((opos + RECORD_SIZE, text.tell()))
|
||||
# Number of bytes from the next record needed to complete the last
|
||||
# character in this record
|
||||
extra = 0
|
||||
|
||||
last = b''
|
||||
while not last.decode('utf-8', 'ignore'):
|
||||
# last contains no valid utf-8 characters
|
||||
size = len(last) + 1
|
||||
text.seek(npos - size)
|
||||
last = text.read(size)
|
||||
|
||||
# last now has one valid utf-8 char and possibly some bytes that belong
|
||||
# to a truncated char
|
||||
|
||||
try:
|
||||
last.decode('utf-8', 'strict')
|
||||
except UnicodeDecodeError:
|
||||
# There are some truncated bytes in last
|
||||
prev = len(last)
|
||||
while True:
|
||||
text.seek(npos - prev)
|
||||
last = text.read(len(last) + 1)
|
||||
try:
|
||||
last.decode('utf-8')
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
else:
|
||||
break
|
||||
extra = len(last) - prev
|
||||
|
||||
text.seek(opos)
|
||||
data = text.read(RECORD_SIZE)
|
||||
overlap = text.read(extra)
|
||||
text.seek(npos)
|
||||
|
||||
return data, overlap
|
||||
|
||||
# }}}
|
||||
|
||||
def generate_record0(self): # MOBI header {{{
|
||||
@ -315,11 +190,20 @@ class MobiWriter(object):
|
||||
# header as well
|
||||
bt = 0x103 if self.indexer.is_flat_periodical else 0x101
|
||||
|
||||
exth = self.build_exth(bt)
|
||||
from calibre.ebooks.mobi.writer8.exth import build_exth
|
||||
exth = build_exth(metadata,
|
||||
prefer_author_sort=self.opts.prefer_author_sort,
|
||||
is_periodical=self.is_periodical,
|
||||
share_not_sync=self.opts.share_not_sync,
|
||||
cover_offset=self.cover_offset,
|
||||
thumbnail_offset=self.thumbnail_offset,
|
||||
start_offset=self.serializer.start_offset, mobi_doctype=bt
|
||||
)
|
||||
first_image_record = None
|
||||
if self.image_records:
|
||||
if self.resources:
|
||||
used_images = self.serializer.used_images
|
||||
first_image_record = len(self.records)
|
||||
self.records.extend(self.image_records)
|
||||
self.resources.serialize(self.records, used_images)
|
||||
last_content_record = len(self.records) - 1
|
||||
|
||||
# FCIS/FLIS (Seems to serve no purpose)
|
||||
@ -481,125 +365,72 @@ class MobiWriter(object):
|
||||
self.records[0] = align_block(record0)
|
||||
# }}}
|
||||
|
||||
def build_exth(self, mobi_doctype): # EXTH Header {{{
|
||||
oeb = self.oeb
|
||||
exth = StringIO()
|
||||
nrecs = 0
|
||||
for term in oeb.metadata:
|
||||
if term not in EXTH_CODES: continue
|
||||
code = EXTH_CODES[term]
|
||||
items = oeb.metadata[term]
|
||||
if term == 'creator':
|
||||
if self.prefer_author_sort:
|
||||
creators = [normalize(unicode(c.file_as or c)) for c in
|
||||
items][:1]
|
||||
else:
|
||||
creators = [normalize(unicode(c)) for c in items]
|
||||
items = ['; '.join(creators)]
|
||||
for item in items:
|
||||
data = normalize(unicode(item))
|
||||
if term != 'description':
|
||||
data = self.COLLAPSE_RE.sub(' ', data)
|
||||
if term == 'identifier':
|
||||
if data.lower().startswith('urn:isbn:'):
|
||||
data = data[9:]
|
||||
elif item.scheme.lower() == 'isbn':
|
||||
pass
|
||||
else:
|
||||
continue
|
||||
data = data.encode('utf-8')
|
||||
exth.write(pack(b'>II', code, len(data) + 8))
|
||||
exth.write(data)
|
||||
nrecs += 1
|
||||
if term == 'rights' :
|
||||
try:
|
||||
rights = normalize(unicode(oeb.metadata.rights[0])).encode('utf-8')
|
||||
except:
|
||||
rights = b'Unknown'
|
||||
exth.write(pack(b'>II', EXTH_CODES['rights'], len(rights) + 8))
|
||||
exth.write(rights)
|
||||
nrecs += 1
|
||||
def generate_joint_record0(self): # {{{
|
||||
from calibre.ebooks.mobi.writer8.mobi import (MOBIHeader,
|
||||
HEADER_FIELDS)
|
||||
from calibre.ebooks.mobi.writer8.exth import build_exth
|
||||
|
||||
# Write UUID as ASIN
|
||||
uuid = None
|
||||
from calibre.ebooks.oeb.base import OPF
|
||||
for x in oeb.metadata['identifier']:
|
||||
if (x.get(OPF('scheme'), None).lower() == 'uuid' or
|
||||
unicode(x).startswith('urn:uuid:')):
|
||||
uuid = unicode(x).split(':')[-1]
|
||||
break
|
||||
if uuid is None:
|
||||
from uuid import uuid4
|
||||
uuid = str(uuid4())
|
||||
# Insert resource records
|
||||
first_image_record = None
|
||||
old = len(self.records)
|
||||
if self.resources:
|
||||
used_images = self.serializer.used_images | self.kf8.used_images
|
||||
first_image_record = len(self.records)
|
||||
self.resources.serialize(self.records, used_images)
|
||||
resource_record_count = len(self.records) - old
|
||||
|
||||
if isinstance(uuid, unicode):
|
||||
uuid = uuid.encode('utf-8')
|
||||
if not self.opts.share_not_sync:
|
||||
exth.write(pack(b'>II', 113, len(uuid) + 8))
|
||||
exth.write(uuid)
|
||||
nrecs += 1
|
||||
# Insert KF8 records
|
||||
self.records.append(b'BOUNDARY')
|
||||
kf8_header_index = len(self.records)
|
||||
self.kf8.start_offset = (self.serializer.start_offset,
|
||||
self.kf8.start_offset)
|
||||
self.records.append(self.kf8.record0)
|
||||
self.records.extend(self.kf8.records[1:])
|
||||
|
||||
# Write cdetype
|
||||
if not self.is_periodical:
|
||||
if not self.opts.share_not_sync:
|
||||
exth.write(pack(b'>II', 501, 12))
|
||||
exth.write(b'EBOK')
|
||||
nrecs += 1
|
||||
else:
|
||||
ids = {0x101:b'NWPR', 0x103:b'MAGZ'}.get(mobi_doctype, None)
|
||||
if ids:
|
||||
exth.write(pack(b'>II', 501, 12))
|
||||
exth.write(ids)
|
||||
nrecs += 1
|
||||
first_image_record = (first_image_record if first_image_record else
|
||||
len(self.records))
|
||||
|
||||
# Add a publication date entry
|
||||
if oeb.metadata['date']:
|
||||
datestr = str(oeb.metadata['date'][0])
|
||||
elif oeb.metadata['timestamp']:
|
||||
datestr = str(oeb.metadata['timestamp'][0])
|
||||
header_fields = {k:getattr(self.kf8, k) for k in HEADER_FIELDS}
|
||||
|
||||
if datestr is None:
|
||||
raise ValueError("missing date or timestamp")
|
||||
# Now change the header fields that need to be different in the MOBI 6
|
||||
# header
|
||||
header_fields['first_resource_record'] = first_image_record
|
||||
header_fields['exth_flags'] = 0b100001010000 # Kinglegen uses this
|
||||
header_fields['fdst_record'] = NULL_INDEX
|
||||
header_fields['fdst_count'] = 1 # Why not 0? Kindlegen uses 1
|
||||
extra_data_flags = 0b1 # Has multibyte overlap bytes
|
||||
if self.primary_index_record_idx is not None:
|
||||
extra_data_flags |= 0b10
|
||||
header_fields['extra_data_flags'] = extra_data_flags
|
||||
|
||||
datestr = bytes(datestr)
|
||||
exth.write(pack(b'>II', EXTH_CODES['pubdate'], len(datestr) + 8))
|
||||
exth.write(datestr)
|
||||
nrecs += 1
|
||||
if self.is_periodical:
|
||||
exth.write(pack(b'>II', EXTH_CODES['lastupdatetime'], len(datestr) + 8))
|
||||
exth.write(datestr)
|
||||
nrecs += 1
|
||||
for k, v in {'last_text_record':'last_text_record_idx',
|
||||
'first_non_text_record':'first_non_text_record_idx',
|
||||
'ncx_index':'primary_index_record_idx',
|
||||
}.iteritems():
|
||||
header_fields[k] = getattr(self, v)
|
||||
if header_fields['ncx_index'] is None:
|
||||
header_fields['ncx_index'] = NULL_INDEX
|
||||
|
||||
if self.is_periodical:
|
||||
# Pretend to be amazon's super secret periodical generator
|
||||
vals = {204:201, 205:2, 206:0, 207:101}
|
||||
else:
|
||||
# Pretend to be kindlegen 1.2
|
||||
vals = {204:201, 205:1, 206:2, 207:33307}
|
||||
for code, val in vals.iteritems():
|
||||
exth.write(pack(b'>III', code, 12, val))
|
||||
nrecs += 1
|
||||
for x in ('skel', 'chunk', 'guide'):
|
||||
header_fields[x+'_index'] = NULL_INDEX
|
||||
|
||||
if self.cover_offset is not None:
|
||||
exth.write(pack(b'>III', EXTH_CODES['coveroffset'], 12,
|
||||
self.cover_offset))
|
||||
exth.write(pack(b'>III', EXTH_CODES['hasfakecover'], 12, 0))
|
||||
nrecs += 2
|
||||
if self.thumbnail_offset is not None:
|
||||
exth.write(pack(b'>III', EXTH_CODES['thumboffset'], 12,
|
||||
self.thumbnail_offset))
|
||||
nrecs += 1
|
||||
# Create the MOBI 6 EXTH
|
||||
opts = self.opts
|
||||
kuc = 0 if resource_record_count > 0 else None
|
||||
|
||||
if self.serializer.start_offset is not None:
|
||||
exth.write(pack(b'>III', EXTH_CODES['startreading'], 12,
|
||||
self.serializer.start_offset))
|
||||
nrecs += 1
|
||||
header_fields['exth'] = build_exth(self.oeb.metadata,
|
||||
prefer_author_sort=opts.prefer_author_sort,
|
||||
is_periodical=opts.mobi_periodical,
|
||||
share_not_sync=opts.share_not_sync,
|
||||
cover_offset=self.cover_offset,
|
||||
thumbnail_offset=self.thumbnail_offset,
|
||||
num_of_resources=resource_record_count,
|
||||
kf8_unknown_count=kuc, be_kindlegen2=True,
|
||||
kf8_header_index=kf8_header_index,
|
||||
start_offset=self.serializer.start_offset,
|
||||
mobi_doctype=2)
|
||||
self.records[0] = MOBIHeader(file_version=6)(**header_fields)
|
||||
|
||||
exth = exth.getvalue()
|
||||
trail = len(exth) % 4
|
||||
pad = b'\0' * (4 - trail) # Always pad w/ at least 1 byte
|
||||
exth = [b'EXTH', pack(b'>II', len(exth) + 12, nrecs), exth, pad]
|
||||
return b''.join(exth)
|
||||
# }}}
|
||||
|
||||
def write_header(self): # PalmDB header {{{
|
||||
|
136
src/calibre/ebooks/mobi/writer2/resources.py
Normal file
@ -0,0 +1,136 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import imghdr
|
||||
|
||||
from calibre.ebooks.mobi import MAX_THUMB_DIMEN, MAX_THUMB_SIZE
|
||||
from calibre.ebooks.mobi.utils import (rescale_image, mobify_image,
|
||||
write_font_record)
|
||||
from calibre.ebooks import generate_masthead
|
||||
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
|
||||
|
||||
PLACEHOLDER_GIF = b'GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff!\xf9\x04\x01\x00\x00\x00\x00,\x00\x00\x00\x00\x01\x00\x01\x00@\x02\x01D\x00;'
|
||||
|
||||
class Resources(object):
|
||||
|
||||
def __init__(self, oeb, opts, is_periodical, add_fonts=False):
|
||||
self.oeb, self.log, self.opts = oeb, oeb.log, opts
|
||||
self.is_periodical = is_periodical
|
||||
|
||||
self.item_map = {}
|
||||
self.records = []
|
||||
self.mime_map = {}
|
||||
self.masthead_offset = 0
|
||||
self.used_image_indices = set()
|
||||
self.image_indices = set()
|
||||
self.cover_offset = self.thumbnail_offset = None
|
||||
|
||||
self.add_resources(add_fonts)
|
||||
|
||||
def process_image(self, data):
|
||||
return (mobify_image(data) if self.opts.mobi_keep_original_images else
|
||||
rescale_image(data))
|
||||
|
||||
def add_resources(self, add_fonts):
|
||||
oeb = self.oeb
|
||||
oeb.logger.info('Serializing resources...')
|
||||
index = 1
|
||||
|
||||
mh_href = None
|
||||
if 'masthead' in oeb.guide and oeb.guide['masthead'].href:
|
||||
mh_href = oeb.guide['masthead'].href
|
||||
self.records.append(None)
|
||||
index += 1
|
||||
self.used_image_indices.add(0)
|
||||
self.image_indices.add(0)
|
||||
elif self.is_periodical:
|
||||
# Generate a default masthead
|
||||
data = generate_masthead(unicode(self.oeb.metadata['title'][0]))
|
||||
self.records.append(data)
|
||||
self.used_image_indices.add(0)
|
||||
self.image_indices.add(0)
|
||||
index += 1
|
||||
|
||||
cover_href = self.cover_offset = self.thumbnail_offset = None
|
||||
if (oeb.metadata.cover and
|
||||
unicode(oeb.metadata.cover[0]) in oeb.manifest.ids):
|
||||
cover_id = unicode(oeb.metadata.cover[0])
|
||||
item = oeb.manifest.ids[cover_id]
|
||||
cover_href = item.href
|
||||
|
||||
for item in self.oeb.manifest.values():
|
||||
if item.media_type not in OEB_RASTER_IMAGES: continue
|
||||
try:
|
||||
data = self.process_image(item.data)
|
||||
except:
|
||||
self.log.warn('Bad image file %r' % item.href)
|
||||
continue
|
||||
else:
|
||||
if mh_href and item.href == mh_href:
|
||||
self.records[0] = data
|
||||
continue
|
||||
|
||||
self.image_indices.add(len(self.records))
|
||||
self.records.append(data)
|
||||
self.item_map[item.href] = index
|
||||
self.mime_map[item.href] = 'image/%s'%imghdr.what(None, data)
|
||||
index += 1
|
||||
|
||||
if cover_href and item.href == cover_href:
|
||||
self.cover_offset = self.item_map[item.href] - 1
|
||||
self.used_image_indices.add(self.cover_offset)
|
||||
try:
|
||||
data = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
|
||||
maxsizeb=MAX_THUMB_SIZE)
|
||||
except:
|
||||
self.log.warn('Failed to generate thumbnail')
|
||||
else:
|
||||
self.image_indices.add(len(self.records))
|
||||
self.records.append(data)
|
||||
self.thumbnail_offset = index - 1
|
||||
self.used_image_indices.add(self.thumbnail_offset)
|
||||
index += 1
|
||||
finally:
|
||||
item.unload_data_from_memory()
|
||||
|
||||
if add_fonts:
|
||||
for item in self.oeb.manifest.values():
|
||||
if item.href and item.href.rpartition('.')[-1].lower() in {
|
||||
'ttf', 'otf'} and isinstance(item.data, bytes):
|
||||
self.records.append(write_font_record(item.data))
|
||||
self.item_map[item.href] = len(self.records)
|
||||
|
||||
def add_extra_images(self):
|
||||
'''
|
||||
Add any images that were created after the call to add_resources()
|
||||
'''
|
||||
for item in self.oeb.manifest.values():
|
||||
if (item.media_type not in OEB_RASTER_IMAGES or item.href in
|
||||
self.item_map): continue
|
||||
try:
|
||||
data = self.process_image(item.data)
|
||||
except:
|
||||
self.log.warn('Bad image file %r' % item.href)
|
||||
else:
|
||||
self.records.append(data)
|
||||
self.item_map[item.href] = len(self.records)
|
||||
finally:
|
||||
item.unload_data_from_memory()
|
||||
|
||||
def serialize(self, records, used_images):
|
||||
used_image_indices = self.used_image_indices | {
|
||||
v-1 for k, v in self.item_map.iteritems() if k in used_images}
|
||||
for i in self.image_indices-used_image_indices:
|
||||
self.records[i] = PLACEHOLDER_GIF
|
||||
records.extend(self.records)
|
||||
|
||||
def __bool__(self):
|
||||
return bool(self.records)
|
||||
__nonzero__ = __bool__
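A hedged sketch of how MobiWriter drives this class; oeb, opts and serializer are assumed to already exist, as in writer2/main.py above:

resources = Resources(oeb, opts, is_periodical=False, add_fonts=True)
records = [None]                        # record 0 is reserved for the MOBI header
resources.serialize(records, serializer.used_images)
# images never referenced from the serialized text are replaced by PLACEHOLDER_GIF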
|
||||
|
@ -12,6 +12,7 @@ import re
|
||||
from calibre.ebooks.oeb.base import (OEB_DOCS, XHTML, XHTML_NS, XML_NS,
|
||||
namespace, prefixname, urlnormalize)
|
||||
from calibre.ebooks.mobi.mobiml import MBP_NS
|
||||
from calibre.ebooks.mobi.utils import is_guide_ref_start
|
||||
|
||||
from collections import defaultdict
|
||||
from urlparse import urldefrag
|
||||
@ -39,6 +40,7 @@ class Serializer(object):
|
||||
self.oeb = oeb
|
||||
# Map of image hrefs to image index in the MOBI file
|
||||
self.images = images
|
||||
self.used_images = set()
|
||||
self.logger = oeb.logger
|
||||
self.is_periodical = is_periodical
|
||||
self.write_page_breaks_after_item = write_page_breaks_after_item
|
||||
@ -160,9 +162,7 @@ class Serializer(object):
|
||||
buf.write(b'title="')
|
||||
self.serialize_text(ref.title, quot=True)
|
||||
buf.write(b'" ')
|
||||
if (ref.title.lower() == 'start' or
|
||||
(ref.type and ref.type.lower() in {'start',
|
||||
'other.start', 'text'})):
|
||||
if is_guide_ref_start(ref):
|
||||
self._start_href = ref.href
|
||||
self.serialize_href(ref.href)
|
||||
# Space required or won't work, I kid you not
|
||||
@ -329,6 +329,7 @@ class Serializer(object):
|
||||
href = urlnormalize(item.abshref(val))
|
||||
if href in self.images:
|
||||
index = self.images[href]
|
||||
self.used_images.add(href)
|
||||
buf.write(b'recindex="%05d"' % index)
|
||||
continue
|
||||
buf.write(attr.encode('utf-8'))
|
||||
|
11
src/calibre/ebooks/mobi/writer8/__init__.py
Normal file
@ -0,0 +1,11 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
|
188
src/calibre/ebooks/mobi/writer8/exth.py
Normal file
@ -0,0 +1,188 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
from struct import pack
|
||||
from io import BytesIO
|
||||
|
||||
from calibre.ebooks.mobi.utils import utf8_text
|
||||
|
||||
EXTH_CODES = {
|
||||
'creator': 100,
|
||||
'publisher': 101,
|
||||
'description': 103,
|
||||
'identifier': 104,
|
||||
'subject': 105,
|
||||
'pubdate': 106,
|
||||
'review': 107,
|
||||
'contributor': 108,
|
||||
'rights': 109,
|
||||
'type': 111,
|
||||
    'source': 112,
    'versionnumber': 114,
    'startreading': 116,
    'kf8_header_index': 121,
    'num_of_resources': 125,
    'kf8_unknown_count': 131,
    'coveroffset': 201,
    'thumboffset': 202,
    'hasfakecover': 203,
    'lastupdatetime': 502,
    'title': 503,
}

COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')

def build_exth(metadata, prefer_author_sort=False, is_periodical=False,
        share_not_sync=True, cover_offset=None, thumbnail_offset=None,
        start_offset=None, mobi_doctype=2, num_of_resources=None,
        kf8_unknown_count=0, be_kindlegen2=False, kf8_header_index=None):
    exth = BytesIO()
    nrecs = 0

    for term in metadata:
        if term not in EXTH_CODES: continue
        code = EXTH_CODES[term]
        items = metadata[term]
        if term == 'creator':
            if prefer_author_sort:
                creators = [unicode(c.file_as or c) for c in items][:1]
            else:
                creators = [unicode(c) for c in items]
            items = ['; '.join(creators)]
        for item in items:
            data = unicode(item)
            if term != 'description':
                data = COLLAPSE_RE.sub(' ', data)
            if term == 'identifier':
                if data.lower().startswith('urn:isbn:'):
                    data = data[9:]
                elif item.scheme.lower() == 'isbn':
                    pass
                else:
                    continue
            data = utf8_text(data)
            exth.write(pack(b'>II', code, len(data) + 8))
            exth.write(data)
            nrecs += 1
        if term == 'rights':
            try:
                rights = utf8_text(unicode(metadata.rights[0]))
            except:
                rights = b'Unknown'
            exth.write(pack(b'>II', EXTH_CODES['rights'], len(rights) + 8))
            exth.write(rights)
            nrecs += 1

    # Write UUID as ASIN
    uuid = None
    from calibre.ebooks.oeb.base import OPF
    for x in metadata['identifier']:
        if (x.get(OPF('scheme'), None).lower() == 'uuid' or
                unicode(x).startswith('urn:uuid:')):
            uuid = unicode(x).split(':')[-1]
            break
    if uuid is None:
        from uuid import uuid4
        uuid = str(uuid4())

    if isinstance(uuid, unicode):
        uuid = uuid.encode('utf-8')
    if not share_not_sync:
        exth.write(pack(b'>II', 113, len(uuid) + 8))
        exth.write(uuid)
        nrecs += 1

    # Write cdetype
    if not is_periodical:
        if not share_not_sync:
            exth.write(pack(b'>II', 501, 12))
            exth.write(b'EBOK')
            nrecs += 1
    else:
        ids = {0x101:b'NWPR', 0x103:b'MAGZ'}.get(mobi_doctype, None)
        if ids:
            exth.write(pack(b'>II', 501, 12))
            exth.write(ids)
            nrecs += 1

    # Add a publication date entry
    if metadata['date']:
        datestr = str(metadata['date'][0])
    elif metadata['timestamp']:
        datestr = str(metadata['timestamp'][0])

    if datestr is None:
        raise ValueError("missing date or timestamp")

    datestr = bytes(datestr)
    exth.write(pack(b'>II', EXTH_CODES['pubdate'], len(datestr) + 8))
    exth.write(datestr)
    nrecs += 1
    if is_periodical:
        exth.write(pack(b'>II', EXTH_CODES['lastupdatetime'], len(datestr) + 8))
        exth.write(datestr)
        nrecs += 1

    if be_kindlegen2:
        vals = {204:201, 205:2, 206:2, 207:35621}
    elif is_periodical:
        # Pretend to be amazon's super secret periodical generator
        vals = {204:201, 205:2, 206:0, 207:101}
    else:
        # Pretend to be kindlegen 1.2
        vals = {204:201, 205:1, 206:2, 207:33307}
    for code, val in vals.iteritems():
        exth.write(pack(b'>III', code, 12, val))
        nrecs += 1

    if cover_offset is not None:
        exth.write(pack(b'>III', EXTH_CODES['coveroffset'], 12,
                cover_offset))
        exth.write(pack(b'>III', EXTH_CODES['hasfakecover'], 12, 0))
        nrecs += 2
    if thumbnail_offset is not None:
        exth.write(pack(b'>III', EXTH_CODES['thumboffset'], 12,
                thumbnail_offset))
        nrecs += 1

    if start_offset is not None:
        try:
            len(start_offset)
        except TypeError:
            start_offset = [start_offset]
        for so in start_offset:
            if so is not None:
                exth.write(pack(b'>III', EXTH_CODES['startreading'], 12,
                        so))
                nrecs += 1

    if kf8_header_index is not None:
        exth.write(pack(b'>III', EXTH_CODES['kf8_header_index'], 12,
                kf8_header_index))
        nrecs += 1

    if num_of_resources is not None:
        exth.write(pack(b'>III', EXTH_CODES['num_of_resources'], 12,
                num_of_resources))
        nrecs += 1

    if kf8_unknown_count is not None:
        exth.write(pack(b'>III', EXTH_CODES['kf8_unknown_count'], 12,
                kf8_unknown_count))
        nrecs += 1

    exth = exth.getvalue()
    trail = len(exth) % 4
    pad = b'\0' * (4 - trail)  # Always pad w/ at least 1 byte
    exth = [b'EXTH', pack(b'>II', len(exth) + 12, nrecs), exth, pad]
    return b''.join(exth)
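The record layout written above is simple enough to show in a standalone sketch (not part of this commit): each EXTH record is a 4 byte code, a 4 byte length that includes the 8 byte record header, and the payload; the whole table is framed by b'EXTH', its total length and the record count, and padded to a 4 byte boundary. The title and the 204 value below are invented sample data.

from struct import pack

def exth_record(code, data):
    # 4 byte code, 4 byte total length (payload + 8 byte record header), payload
    return pack(b'>II', code, len(data) + 8) + data

records = exth_record(503, b'A Sample Title') + exth_record(204, pack(b'>I', 201))
nrecs = 2
pad = b'\0' * (4 - len(records) % 4)   # always at least one byte, as above
exth_block = b'EXTH' + pack(b'>II', len(records) + 12, nrecs) + records + pad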
86 src/calibre/ebooks/mobi/writer8/header.py Normal file
@@ -0,0 +1,86 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
        print_function)

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import random
from io import BytesIO
from collections import OrderedDict
from struct import pack

from calibre.ebooks.mobi.utils import align_block

NULL = 0xffffffff
zeroes = lambda x: b'\0'*x
nulls = lambda x: b'\xff'*x
short = lambda x: pack(b'>H', x)

class Header(OrderedDict):

    HEADER_NAME = b''

    DEFINITION = '''
    '''

    ALIGN_BLOCK = False
    POSITIONS = {}  # Mapping of position field to field whose position should
                    # be stored in the position field
    SHORT_FIELDS = set()

    def __init__(self):
        OrderedDict.__init__(self)

        for line in self.DEFINITION.splitlines():
            line = line.strip()
            if not line or line.startswith('#'): continue
            name, val = [x.strip() for x in line.partition('=')[0::2]]
            if val:
                val = eval(val, {'zeroes':zeroes, 'NULL':NULL, 'DYN':None,
                    'nulls':nulls, 'short':short, 'random':random})
            else:
                val = 0
            if name in self:
                raise ValueError('Duplicate field in definition: %r'%name)
            self[name] = val

    @property
    def dynamic_fields(self):
        return tuple(k for k, v in self.iteritems() if v is None)

    def __call__(self, **kwargs):
        positions = {}
        for name, val in kwargs.iteritems():
            if name not in self:
                raise KeyError('Not a valid header field: %r'%name)
            self[name] = val

        buf = BytesIO()
        buf.write(bytes(self.HEADER_NAME))
        for name, val in self.iteritems():
            val = self.format_value(name, val)
            positions[name] = buf.tell()
            if val is None:
                raise ValueError('Dynamic field %r not set'%name)
            if isinstance(val, (int, long)):
                fmt = 'H' if name in self.SHORT_FIELDS else 'I'
                val = pack(b'>'+fmt, val)
            buf.write(val)

        for pos_field, field in self.POSITIONS.iteritems():
            buf.seek(positions[pos_field])
            buf.write(pack(b'>I', positions[field]))

        ans = buf.getvalue()
        if self.ALIGN_BLOCK:
            ans = align_block(ans)
        return ans

    def format_value(self, name, val):
        return val
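For illustration only, a hypothetical subclass showing how the DEFINITION mini-language above is consumed: `name = value` lines become fixed fields, a value of DYN marks a field that must be supplied when the header is rendered, and a bare name with no value defaults to a 32 bit zero. ToyHeader and its fields are invented, not part of calibre.

class ToyHeader(Header):

    HEADER_NAME = b'TOYH'

    DEFINITION = '''
    # A fixed 32 bit field
    magic = 0xdeadbeef

    # A field that must be supplied when the header is rendered
    num_items = DYN

    # Eight bytes of padding
    padding = zeroes(8)
    '''

# raw = ToyHeader()(num_items=42)  # -> b'TOYH' followed by the packed fields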
335 src/calibre/ebooks/mobi/writer8/index.py Normal file
@@ -0,0 +1,335 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
from future_builtins import map
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from collections import namedtuple
|
||||
from struct import pack
|
||||
from io import BytesIO
|
||||
|
||||
from calibre.ebooks.mobi.utils import CNCX, encint, align_block
|
||||
from calibre.ebooks.mobi.writer8.header import Header
|
||||
|
||||
TagMeta_ = namedtuple('TagMeta',
|
||||
'name number values_per_entry bitmask end_flag')
|
||||
TagMeta = lambda x:TagMeta_(*x)
|
||||
EndTagTable = TagMeta(('eof', 0, 0, 0, 1))
|
||||
|
||||
# map of mask to number of shifts needed, works with 1 bit and two-bit wide masks
|
||||
# could also be extended to 4 bit wide ones as well
|
||||
mask_to_bit_shifts = { 1:0, 2:1, 3:0, 4:2, 8:3, 12:2, 16:4, 32:5, 48:4, 64:6,
|
||||
128:7, 192: 6 }
|
||||
|
||||
class IndexHeader(Header): # {{{
|
||||
|
||||
HEADER_NAME = b'INDX'
|
||||
ALIGN_BLOCK = True
|
||||
HEADER_LENGTH = 192
|
||||
|
||||
DEFINITION = '''
|
||||
# 4 - 8: Header Length
|
||||
header_length = {header_length}
|
||||
|
||||
# 8 - 16: Unknown
|
||||
unknown1 = zeroes(8)
|
||||
|
||||
# 16 - 20: Index type: 0 - normal 2 - inflection
|
||||
type = 2
|
||||
|
||||
# 20 - 24: IDXT offset (filled in later)
|
||||
idxt_offset
|
||||
|
||||
# 24 - 28: Number of index records
|
||||
num_of_records = 1
|
||||
|
||||
# 28 - 32: Index encoding (65001 = utf-8)
|
||||
encoding = 65001
|
||||
|
||||
# 32 - 36: Unknown
|
||||
unknown2 = NULL
|
||||
|
||||
# 36 - 40: Number of Index entries
|
||||
num_of_entries = DYN
|
||||
|
||||
# 40 - 44: ORDT offset
|
||||
ordt_offset
|
||||
|
||||
# 44 - 48: LIGT offset
|
||||
ligt_offset
|
||||
|
||||
# 48 - 52: Number of ORDT/LIGT? entries
|
||||
num_of_ordt_entries
|
||||
|
||||
# 52 - 56: Number of CNCX records
|
||||
num_of_cncx = DYN
|
||||
|
||||
# 56 - 180: Unknown
|
||||
unknown3 = zeroes(124)
|
||||
|
||||
# 180 - 184: TAGX offset
|
||||
tagx_offset = {header_length}
|
||||
|
||||
# 184 - 192: Unknown
|
||||
unknown4 = zeroes(8)
|
||||
|
||||
# TAGX
|
||||
tagx = DYN
|
||||
|
||||
# Last Index entry
|
||||
last_index = DYN
|
||||
|
||||
# IDXT
|
||||
idxt = DYN
|
||||
'''.format(header_length=HEADER_LENGTH)
|
||||
|
||||
POSITIONS = {'idxt_offset':'idxt'}
|
||||
# }}}
|
||||
|
||||
class Index(object): # {{{
|
||||
|
||||
control_byte_count = 1
|
||||
cncx = CNCX()
|
||||
tag_types = (EndTagTable,)
|
||||
|
||||
HEADER_LENGTH = IndexHeader.HEADER_LENGTH
|
||||
|
||||
@classmethod
|
||||
def generate_tagx(cls):
|
||||
header = b'TAGX'
|
||||
byts = bytearray()
|
||||
for tag_meta in cls.tag_types:
|
||||
byts.extend(tag_meta[1:])
|
||||
# table length, control byte count
|
||||
header += pack(b'>II', 12+len(byts), cls.control_byte_count)
|
||||
return header + bytes(byts)
|
||||
|
||||
@classmethod
|
||||
def calculate_control_bytes_for_each_entry(cls, entries):
|
||||
control_bytes = []
|
||||
for lead_text, tags in entries:
|
||||
cbs = []
|
||||
ans = 0
|
||||
for (name, number, vpe, mask, endi) in cls.tag_types:
|
||||
if endi == 1:
|
||||
cbs.append(ans)
|
||||
ans = 0
|
||||
continue
|
||||
try:
|
||||
nvals = len(tags.get(name, ()))
|
||||
except TypeError:
|
||||
nvals = 1
|
||||
nentries = nvals // vpe
|
||||
shifts = mask_to_bit_shifts[mask]
|
||||
ans |= mask & (nentries << shifts)
|
||||
if len(cbs) != cls.control_byte_count:
|
||||
raise ValueError('The entry %r is invalid'%[lead_text, tags])
|
||||
control_bytes.append(cbs)
|
||||
return control_bytes
|
||||
|
||||
def __call__(self):
|
||||
self.control_bytes = self.calculate_control_bytes_for_each_entry(
|
||||
self.entries)
|
||||
|
||||
rendered_entries = []
|
||||
index, idxt, buf = BytesIO(), BytesIO(), BytesIO()
|
||||
IndexEntry = namedtuple('IndexEntry', 'offset length raw')
|
||||
last_lead_text = b''
|
||||
too_large = ValueError('Index has too many entries, calibre does not'
|
||||
' support generating multiple index records at this'
|
||||
' time.')
|
||||
|
||||
for i, x in enumerate(self.entries):
|
||||
control_bytes = self.control_bytes[i]
|
||||
leading_text, tags = x
|
||||
buf.seek(0), buf.truncate(0)
|
||||
leading_text = (leading_text.encode('utf-8') if
|
||||
isinstance(leading_text, unicode) else leading_text)
|
||||
raw = bytearray(leading_text)
|
||||
raw.insert(0, len(leading_text))
|
||||
buf.write(bytes(raw))
|
||||
buf.write(bytes(bytearray(control_bytes)))
|
||||
for tag in self.tag_types:
|
||||
values = tags.get(tag.name, None)
|
||||
if values is None: continue
|
||||
try:
|
||||
len(values)
|
||||
except TypeError:
|
||||
values = [values]
|
||||
if values:
|
||||
for val in values:
|
||||
try:
|
||||
buf.write(encint(val))
|
||||
except ValueError:
|
||||
raise ValueError('Invalid values for %r: %r'%(
|
||||
tag, values))
|
||||
raw = buf.getvalue()
|
||||
offset = index.tell()
|
||||
if offset + self.HEADER_LENGTH >= 0x10000:
|
||||
raise too_large
|
||||
rendered_entries.append(IndexEntry(offset, len(raw), raw))
|
||||
idxt.write(pack(b'>H', self.HEADER_LENGTH+offset))
|
||||
index.write(raw)
|
||||
last_lead_text = leading_text
|
||||
|
||||
index_block = align_block(index.getvalue())
|
||||
idxt_block = align_block(b'IDXT' + idxt.getvalue())
|
||||
body = index_block + idxt_block
|
||||
if len(body) + self.HEADER_LENGTH >= 0x10000:
|
||||
raise too_large
|
||||
header = b'INDX'
|
||||
buf.seek(0), buf.truncate(0)
|
||||
buf.write(pack(b'>I', self.HEADER_LENGTH))
|
||||
buf.write(b'\0'*4) # Unknown
|
||||
buf.write(pack(b'>I', 1)) # Header type? Or index record number?
|
||||
buf.write(b'\0'*4) # Unknown
|
||||
|
||||
# IDXT block offset
|
||||
buf.write(pack(b'>I', self.HEADER_LENGTH + len(index_block)))
|
||||
|
||||
# Number of index entries
|
||||
buf.write(pack(b'>I', len(rendered_entries)))
|
||||
|
||||
buf.write(b'\xff'*8) # Unknown
|
||||
|
||||
buf.write(b'\0'*156) # Unknown
|
||||
|
||||
header += buf.getvalue()
|
||||
index_record = header + body
|
||||
|
||||
tagx = self.generate_tagx()
|
||||
idxt = (b'IDXT' + pack(b'>H', IndexHeader.HEADER_LENGTH + len(tagx)) +
|
||||
b'\0')
|
||||
# Last index
|
||||
idx = bytes(bytearray([len(last_lead_text)])) + last_lead_text
|
||||
idx += pack(b'>H', len(rendered_entries))
|
||||
|
||||
header = {
|
||||
'num_of_entries': len(rendered_entries),
|
||||
'num_of_cncx': len(self.cncx),
|
||||
'tagx':tagx,
|
||||
'last_index':align_block(idx),
|
||||
'idxt':idxt
|
||||
}
|
||||
header = IndexHeader()(**header)
|
||||
self.records = [header, index_record]
|
||||
self.records.extend(self.cncx.records)
|
||||
return self.records
|
||||
# }}}
|
||||
|
||||
class SkelIndex(Index):
|
||||
|
||||
tag_types = tuple(map(TagMeta, (
|
||||
('chunk_count', 1, 1, 3, 0),
|
||||
('geometry', 6, 2, 12, 0),
|
||||
EndTagTable
|
||||
)))
|
||||
|
||||
def __init__(self, skel_table):
|
||||
self.entries = [
|
||||
(s.name, {
|
||||
# Dont ask me why these entries have to be repeated twice
|
||||
'chunk_count':(s.chunk_count, s.chunk_count),
|
||||
'geometry':(s.start_pos, s.length, s.start_pos, s.length),
|
||||
}) for s in skel_table
|
||||
]
|
||||
|
||||
|
||||
class ChunkIndex(Index):
|
||||
|
||||
tag_types = tuple(map(TagMeta, (
|
||||
('cncx_offset', 2, 1, 1, 0),
|
||||
('file_number', 3, 1, 2, 0),
|
||||
('sequence_number', 4, 1, 4, 0),
|
||||
('geometry', 6, 2, 8, 0),
|
||||
EndTagTable
|
||||
)))
|
||||
|
||||
def __init__(self, chunk_table):
|
||||
self.cncx = CNCX(c.selector for c in chunk_table)
|
||||
|
||||
self.entries = [
|
||||
('%010d'%c.insert_pos, {
|
||||
|
||||
'cncx_offset':self.cncx[c.selector],
|
||||
'file_number':c.file_number,
|
||||
'sequence_number':c.sequence_number,
|
||||
'geometry':(c.start_pos, c.length),
|
||||
}) for c in chunk_table
|
||||
]
|
||||
|
||||
class GuideIndex(Index):
|
||||
|
||||
tag_types = tuple(map(TagMeta, (
|
||||
('title', 1, 1, 1, 0),
|
||||
('pos_fid', 6, 2, 2, 0),
|
||||
EndTagTable
|
||||
)))
|
||||
|
||||
def __init__(self, guide_table):
|
||||
self.cncx = CNCX(c.title for c in guide_table)
|
||||
|
||||
self.entries = [
|
||||
(r.type, {
|
||||
|
||||
'title':self.cncx[r.title],
|
||||
'pos_fid':r.pos_fid,
|
||||
}) for r in guide_table
|
||||
]
|
||||
|
||||
|
||||
class NCXIndex(Index):
|
||||
|
||||
''' The commented out parts have been seen in NCX indexes from MOBI 6
|
||||
periodicals. Since we have no MOBI 8 periodicals to reverse engineer, leave
|
||||
it for now. '''
|
||||
# control_byte_count = 2
|
||||
tag_types = tuple(map(TagMeta, (
|
||||
('offset', 1, 1, 1, 0),
|
||||
('length', 2, 1, 2, 0),
|
||||
('label', 3, 1, 4, 0),
|
||||
('depth', 4, 1, 8, 0),
|
||||
('parent', 21, 1, 16, 0),
|
||||
('first_child', 22, 1, 32, 0),
|
||||
('last_child', 23, 1, 64, 0),
|
||||
('pos_fid', 6, 2, 128, 0),
|
||||
EndTagTable,
|
||||
# ('image', 69, 1, 1, 0),
|
||||
# ('description', 70, 1, 2, 0),
|
||||
# ('author', 71, 1, 4, 0),
|
||||
# ('caption', 72, 1, 8, 0),
|
||||
# ('attribution', 73, 1, 16, 0),
|
||||
# EndTagTable
|
||||
)))
|
||||
|
||||
def __init__(self, toc_table):
|
||||
strings = []
|
||||
for entry in toc_table:
|
||||
strings.append(entry['label'])
|
||||
aut = entry.get('author', None)
|
||||
if aut:
|
||||
strings.append(aut)
|
||||
desc = entry.get('description', None)
|
||||
if desc:
|
||||
strings.append(desc)
|
||||
self.cncx = CNCX(strings)
|
||||
|
||||
def to_entry(x):
|
||||
ans = {}
|
||||
for f in ('offset', 'length', 'depth', 'pos_fid', 'parent',
|
||||
'first_child', 'last_child'):
|
||||
if f in x:
|
||||
ans[f] = x[f]
|
||||
for f in ('label', 'description', 'author'):
|
||||
if f in x:
|
||||
ans[f] = self.cncx[x[f]]
|
||||
return ('%02x'%x['index'], ans)
|
||||
|
||||
self.entries = list(map(to_entry, toc_table))
|
||||
|
||||
|
||||
|
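A standalone recomputation (not part of this commit) of the control byte that calculate_control_bytes_for_each_entry() above would produce for one ChunkIndex entry; the tag numbers, masks and values-per-entry come from the ChunkIndex tag table, the sample values are invented.

mask_to_bit_shifts = {1:0, 2:1, 4:2, 8:3}
tags = {'cncx_offset': 10, 'file_number': 0, 'sequence_number': 3,
        'geometry': (4096, 812)}
tag_meta = (('cncx_offset', 1, 1), ('file_number', 1, 2),
            ('sequence_number', 1, 4), ('geometry', 2, 8))  # (name, vpe, mask)
cb = 0
for name, vpe, mask in tag_meta:
    vals = tags[name]
    nvals = len(vals) if isinstance(vals, tuple) else 1
    cb |= mask & ((nvals // vpe) << mask_to_bit_shifts[mask])
print(bin(cb))  # 0b1111: every tag present exactly once in this entry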
406 src/calibre/ebooks/mobi/writer8/main.py Normal file
@@ -0,0 +1,406 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import copy, logging
|
||||
from functools import partial
|
||||
from collections import defaultdict, namedtuple
|
||||
from io import BytesIO
|
||||
from struct import pack
|
||||
|
||||
import cssutils
|
||||
from lxml import etree
|
||||
|
||||
from calibre import isbytestring, force_unicode
|
||||
from calibre.ebooks.mobi.utils import (create_text_record, to_base,
|
||||
is_guide_ref_start)
|
||||
from calibre.ebooks.compression.palmdoc import compress_doc
|
||||
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
|
||||
extract, XHTML, urlnormalize)
|
||||
from calibre.ebooks.oeb.parse_utils import barename
|
||||
from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href
|
||||
from calibre.ebooks.mobi.writer8.index import (NCXIndex, SkelIndex,
|
||||
ChunkIndex, GuideIndex)
|
||||
from calibre.ebooks.mobi.writer8.mobi import KF8Book
|
||||
from calibre.ebooks.mobi.writer8.tbs import apply_trailing_byte_sequences
|
||||
from calibre.ebooks.mobi.writer8.toc import TOCAdder
|
||||
|
||||
XML_DOCS = OEB_DOCS | {SVG_MIME}
|
||||
|
||||
# References to record numbers in KF8 are stored as base-32 encoded integers,
|
||||
# with 4 digits
|
||||
to_ref = partial(to_base, base=32, min_num_digits=4)
|
||||
|
||||
class KF8Writer(object):
|
||||
|
||||
def __init__(self, oeb, opts, resources):
|
||||
self.oeb, self.opts, self.log = oeb, opts, oeb.log
|
||||
self.compress = not self.opts.dont_compress
|
||||
self.has_tbs = False
|
||||
self.log.info('Creating KF8 output')
|
||||
|
||||
# Create an inline ToC if one does not already exist
|
||||
self.toc_adder = TOCAdder(oeb, opts)
|
||||
self.used_images = set()
|
||||
self.resources = resources
|
||||
self.flows = [None] # First flow item is reserved for the text
|
||||
self.records = [None] # Placeholder for zeroth record
|
||||
|
||||
self.log('\tGenerating KF8 markup...')
|
||||
self.dup_data()
|
||||
self.replace_resource_links()
|
||||
self.extract_css_into_flows()
|
||||
self.extract_svg_into_flows()
|
||||
self.replace_internal_links_with_placeholders()
|
||||
self.insert_aid_attributes()
|
||||
self.chunk_it_up()
|
||||
# Dump the cloned data as it is no longer needed
|
||||
del self._data_cache
|
||||
self.create_text_records()
|
||||
self.log('\tCreating indices...')
|
||||
self.create_fdst_records()
|
||||
self.create_indices()
|
||||
self.create_guide()
|
||||
# We do not want to use this ToC for MOBI 6, so remove it
|
||||
self.toc_adder.remove_generated_toc()
|
||||
|
||||
def dup_data(self):
|
||||
''' Duplicate data so that any changes we make to markup/CSS only
|
||||
affect KF8 output and not MOBI 6 output '''
|
||||
self._data_cache = {}
|
||||
# Suppress cssutils logging output as it is duplicated anyway earlier
|
||||
# in the pipeline
|
||||
cssutils.log.setLevel(logging.CRITICAL)
|
||||
for item in self.oeb.manifest:
|
||||
if item.media_type in XML_DOCS:
|
||||
self._data_cache[item.href] = copy.deepcopy(item.data)
|
||||
elif item.media_type in OEB_STYLES:
|
||||
# I can't figure out how to make an efficient copy of the
|
||||
# in-memory CSSStylesheet, as deepcopy doesn't work (raises an
|
||||
# exception)
|
||||
self._data_cache[item.href] = cssutils.parseString(
|
||||
item.data.cssText, validate=False)
|
||||
|
||||
def data(self, item):
|
||||
return self._data_cache.get(item.href, item.data)
|
||||
|
||||
def replace_resource_links(self):
|
||||
''' Replace links to resources (raster images/fonts) with pointers to
|
||||
the MOBI record containing the resource. The pointers are of the form:
|
||||
kindle:embed:XXXX?mime=image/* The ?mime= is apparently optional and
|
||||
not used for fonts. '''
|
||||
|
||||
def pointer(item, oref):
|
||||
ref = item.abshref(oref)
|
||||
idx = self.resources.item_map.get(ref, None)
|
||||
if idx is not None:
|
||||
is_image = self.resources.records[idx-1][:4] not in {b'FONT'}
|
||||
idx = to_ref(idx)
|
||||
if is_image:
|
||||
self.used_images.add(ref)
|
||||
return 'kindle:embed:%s?mime=%s'%(idx,
|
||||
self.resources.mime_map[ref])
|
||||
else:
|
||||
return 'kindle:embed:%s'%idx
|
||||
return oref
|
||||
|
||||
for item in self.oeb.manifest:
|
||||
|
||||
if item.media_type in XML_DOCS:
|
||||
root = self.data(item)
|
||||
for tag in XPath('//h:img|//svg:image')(root):
|
||||
for attr, ref in tag.attrib.iteritems():
|
||||
if attr.split('}')[-1].lower() in {'src', 'href'}:
|
||||
tag.attrib[attr] = pointer(item, ref)
|
||||
|
||||
for tag in XPath('//h:style')(root):
|
||||
if tag.text:
|
||||
sheet = cssutils.parseString(tag.text, validate=False)
|
||||
replacer = partial(pointer, item)
|
||||
cssutils.replaceUrls(sheet, replacer,
|
||||
ignoreImportRules=True)
|
||||
repl = sheet.cssText
|
||||
if isbytestring(repl):
|
||||
repl = repl.decode('utf-8')
|
||||
tag.text = '\n'+ repl + '\n'
|
||||
|
||||
elif item.media_type in OEB_STYLES:
|
||||
sheet = self.data(item)
|
||||
replacer = partial(pointer, item)
|
||||
cssutils.replaceUrls(sheet, replacer, ignoreImportRules=True)
|
||||
|
||||
def extract_css_into_flows(self):
|
||||
inlines = defaultdict(list) # Ensure identical <style>s not repeated
|
||||
sheets = {}
|
||||
|
||||
for item in self.oeb.manifest:
|
||||
if item.media_type in OEB_STYLES:
|
||||
data = self.data(item).cssText
|
||||
sheets[item.href] = len(self.flows)
|
||||
self.flows.append(force_unicode(data, 'utf-8'))
|
||||
|
||||
for item in self.oeb.spine:
|
||||
root = self.data(item)
|
||||
|
||||
for link in XPath('//h:link[@href]')(root):
|
||||
href = item.abshref(link.get('href'))
|
||||
idx = sheets.get(href, None)
|
||||
if idx is not None:
|
||||
idx = to_ref(idx)
|
||||
link.set('href', 'kindle:flow:%s?mime=text/css'%idx)
|
||||
|
||||
for tag in XPath('//h:style')(root):
|
||||
p = tag.getparent()
|
||||
idx = p.index(tag)
|
||||
raw = tag.text
|
||||
if not raw or not raw.strip():
|
||||
extract(tag)
|
||||
continue
|
||||
repl = etree.Element(XHTML('link'), type='text/css',
|
||||
rel='stylesheet')
|
||||
repl.tail='\n'
|
||||
p.insert(idx, repl)
|
||||
extract(tag)
|
||||
inlines[raw].append(repl)
|
||||
|
||||
for raw, elems in inlines.iteritems():
|
||||
idx = to_ref(len(self.flows))
|
||||
self.flows.append(raw)
|
||||
for link in elems:
|
||||
link.set('href', 'kindle:flow:%s?mime=text/css'%idx)
|
||||
|
||||
def extract_svg_into_flows(self):
|
||||
images = {}
|
||||
|
||||
for item in self.oeb.manifest:
|
||||
if item.media_type == SVG_MIME:
|
||||
data = self.data(item)
|
||||
images[item.href] = len(self.flows)
|
||||
self.flows.append(etree.tostring(data, encoding='UTF-8',
|
||||
with_tail=True, xml_declaration=True))
|
||||
|
||||
for item in self.oeb.spine:
|
||||
root = self.data(item)
|
||||
|
||||
for svg in XPath('//svg:svg')(root):
|
||||
raw = etree.tostring(svg, encoding=unicode, with_tail=False)
|
||||
idx = len(self.flows)
|
||||
self.flows.append(raw)
|
||||
p = svg.getparent()
|
||||
pos = p.index(svg)
|
||||
img = etree.Element(XHTML('img'),
|
||||
src="kindle:flow:%s?mime=image/svg+xml"%to_ref(idx))
|
||||
p.insert(pos, img)
|
||||
extract(svg)
|
||||
|
||||
for img in XPath('//h:img[@src]')(root):
|
||||
src = img.get('src')
|
||||
abshref = item.abshref(src)
|
||||
idx = images.get(abshref, None)
|
||||
if idx is not None:
|
||||
img.set('src', 'kindle:flow:%s?mime=image/svg+xml'%
|
||||
to_ref(idx))
|
||||
|
||||
def replace_internal_links_with_placeholders(self):
|
||||
self.link_map = {}
|
||||
count = 0
|
||||
hrefs = {item.href for item in self.oeb.spine}
|
||||
for item in self.oeb.spine:
|
||||
root = self.data(item)
|
||||
|
||||
for a in XPath('//h:a[@href]')(root):
|
||||
count += 1
|
||||
ref = item.abshref(a.get('href'))
|
||||
href, _, frag = ref.partition('#')
|
||||
href = urlnormalize(href)
|
||||
if href in hrefs:
|
||||
placeholder = 'kindle:pos:fid:0000:off:%s'%to_href(count)
|
||||
self.link_map[placeholder] = (href, frag)
|
||||
a.set('href', placeholder)
|
||||
|
||||
def insert_aid_attributes(self):
|
||||
self.id_map = {}
|
||||
for i, item in enumerate(self.oeb.spine):
|
||||
root = self.data(item)
|
||||
aidbase = i * int(1e6)
|
||||
j = 0
|
||||
for tag in root.iterdescendants(etree.Element):
|
||||
id_ = tag.attrib.get('id', None)
|
||||
if id_ is not None or barename(tag.tag).lower() in aid_able_tags:
|
||||
aid = aidbase + j
|
||||
tag.attrib['aid'] = to_base(aid, base=32)
|
||||
if tag.tag == XHTML('body'):
|
||||
self.id_map[(item.href, '')] = tag.attrib['aid']
|
||||
if id_ is not None:
|
||||
self.id_map[(item.href, id_)] = tag.attrib['aid']
|
||||
|
||||
j += 1
|
||||
|
||||
def chunk_it_up(self):
|
||||
placeholder_map = {}
|
||||
for placeholder, x in self.link_map.iteritems():
|
||||
href, frag = x
|
||||
aid = self.id_map.get(x, None)
|
||||
if aid is None:
|
||||
aid = self.id_map.get((href, ''))
|
||||
placeholder_map[placeholder] = aid
|
||||
chunker = Chunker(self.oeb, self.data, placeholder_map)
|
||||
|
||||
for x in ('skel_table', 'chunk_table', 'aid_offset_map'):
|
||||
setattr(self, x, getattr(chunker, x))
|
||||
|
||||
self.flows[0] = chunker.text
|
||||
|
||||
def create_text_records(self):
|
||||
self.flows = [x.encode('utf-8') if isinstance(x, unicode) else x for x
|
||||
in self.flows]
|
||||
text = b''.join(self.flows)
|
||||
self.text_length = len(text)
|
||||
text = BytesIO(text)
|
||||
nrecords = 0
|
||||
records_size = 0
|
||||
|
||||
if self.compress:
|
||||
self.oeb.logger.info('\tCompressing markup...')
|
||||
|
||||
while text.tell() < self.text_length:
|
||||
data, overlap = create_text_record(text)
|
||||
if self.compress:
|
||||
data = compress_doc(data)
|
||||
|
||||
data += overlap
|
||||
data += pack(b'>B', len(overlap))
|
||||
|
||||
self.records.append(data)
|
||||
records_size += len(data)
|
||||
nrecords += 1
|
||||
|
||||
self.last_text_record_idx = nrecords
|
||||
self.first_non_text_record_idx = nrecords + 1
|
||||
# Pad so that the next records starts at a 4 byte boundary
|
||||
if records_size % 4 != 0:
|
||||
self.records.append(b'\x00'*(records_size % 4))
|
||||
self.first_non_text_record_idx += 1
|
||||
|
||||
def create_fdst_records(self):
|
||||
FDST = namedtuple('Flow', 'start end')
|
||||
entries = []
|
||||
self.fdst_table = []
|
||||
for i, flow in enumerate(self.flows):
|
||||
start = 0 if i == 0 else self.fdst_table[-1].end
|
||||
self.fdst_table.append(FDST(start, start + len(flow)))
|
||||
entries.extend(self.fdst_table[-1])
|
||||
rec = (b'FDST' + pack(b'>LL', 12, len(self.fdst_table)) +
|
||||
pack(b'>%dL'%len(entries), *entries))
|
||||
self.fdst_records = [rec]
|
||||
self.fdst_count = len(self.fdst_table)
|
||||
|
||||
def create_indices(self):
|
||||
self.skel_records = SkelIndex(self.skel_table)()
|
||||
self.chunk_records = ChunkIndex(self.chunk_table)()
|
||||
self.ncx_records = []
|
||||
toc = self.oeb.toc
|
||||
entries = []
|
||||
is_periodical = self.opts.mobi_periodical
|
||||
if toc.count() < 2:
|
||||
self.log.warn('Document has no ToC, MOBI will have no NCX index')
|
||||
return
|
||||
|
||||
# Flatten the ToC into a depth first list
|
||||
fl = toc.iter() if is_periodical else toc.iterdescendants()
|
||||
for i, item in enumerate(fl):
|
||||
entry = {'id': id(item), 'index': i, 'href':item.href,
|
||||
'label':(item.title or _('Unknown')),
|
||||
'children':[]}
|
||||
entry['depth'] = getattr(item, 'ncx_hlvl', 0)
|
||||
p = getattr(item, 'ncx_parent', None)
|
||||
if p is not None:
|
||||
entry['parent_id'] = p
|
||||
for child in item:
|
||||
child.ncx_parent = entry['id']
|
||||
child.ncx_hlvl = entry['depth'] + 1
|
||||
entry['children'].append(id(child))
|
||||
if is_periodical:
|
||||
if item.author:
|
||||
entry['author'] = item.author
|
||||
if item.description:
|
||||
entry['description'] = item.description
|
||||
entries.append(entry)
|
||||
|
||||
# The Kindle requires entries to be sorted by (depth, playorder)
|
||||
entries.sort(key=lambda entry: (entry['depth'], entry['index']))
|
||||
for i, entry in enumerate(entries):
|
||||
entry['index'] = i
|
||||
id_to_index = {entry['id']:entry['index'] for entry in entries}
|
||||
|
||||
# Write the hierarchical and start offset information
|
||||
for entry in entries:
|
||||
children = entry.pop('children')
|
||||
if children:
|
||||
entry['first_child'] = id_to_index[children[0]]
|
||||
entry['last_child'] = id_to_index[children[-1]]
|
||||
if 'parent_id' in entry:
|
||||
entry['parent'] = id_to_index[entry.pop('parent_id')]
|
||||
href = entry.pop('href')
|
||||
href, frag = href.partition('#')[0::2]
|
||||
aid = self.id_map.get((href, frag), None)
|
||||
if aid is None:
|
||||
aid = self.id_map.get((href, ''), None)
|
||||
if aid is None:
|
||||
pos, fid = 0, 0
|
||||
else:
|
||||
pos, fid = self.aid_offset_map[aid]
|
||||
chunk = self.chunk_table[pos]
|
||||
offset = chunk.insert_pos + fid
|
||||
entry['pos_fid'] = (pos, fid)
|
||||
entry['offset'] = offset
|
||||
|
||||
# Write the lengths
|
||||
def get_next_start(entry):
|
||||
enders = [e['offset'] for e in entries if e['depth'] <=
|
||||
entry['depth'] and e['offset'] > entry['offset']]
|
||||
if enders:
|
||||
return min(enders)
|
||||
return len(self.flows[0])
|
||||
|
||||
for entry in entries:
|
||||
entry['length'] = get_next_start(entry) - entry['offset']
|
||||
|
||||
self.has_tbs = apply_trailing_byte_sequences(entries, self.records,
|
||||
self.last_text_record_idx+1)
|
||||
self.ncx_records = NCXIndex(entries)()
|
||||
|
||||
def create_guide(self):
|
||||
self.start_offset = None
|
||||
self.guide_table = []
|
||||
self.guide_records = []
|
||||
GuideRef = namedtuple('GuideRef', 'title type pos_fid')
|
||||
for ref in self.oeb.guide.values():
|
||||
href, frag = ref.href.partition('#')[0::2]
|
||||
aid = self.id_map.get((href, frag), None)
|
||||
if aid is None:
|
||||
aid = self.id_map.get((href, ''))
|
||||
if aid is None:
|
||||
continue
|
||||
pos, fid = self.aid_offset_map[aid]
|
||||
if is_guide_ref_start(ref):
|
||||
chunk = self.chunk_table[pos]
|
||||
skel = [s for s in self.skel_table if s.file_number ==
|
||||
chunk.file_number][0]
|
||||
self.start_offset = skel.start_pos + skel.length + chunk.start_pos + fid
|
||||
self.guide_table.append(GuideRef(ref.title or
|
||||
_('Unknown'), ref.type, (pos, fid)))
|
||||
|
||||
if self.guide_table:
|
||||
self.guide_table.sort(key=lambda x:x.type) # Needed by the Kindle
|
||||
self.guide_records = GuideIndex(self.guide_table)()
|
||||
|
||||
def create_kf8_book(oeb, opts, resources, for_joint=False):
|
||||
writer = KF8Writer(oeb, opts, resources)
|
||||
return KF8Book(writer, for_joint=for_joint)
|
||||
|
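A minimal sketch (not part of this commit) of the zero padded base-32 references this writer emits for kindle:embed and kindle:flow links, and, with 10 digits, for the offsets in kindle:pos:fid placeholders. It assumes the usual 0-9A-V digit set for such references; calibre's to_base() in calibre.ebooks.mobi.utils is the authoritative implementation.

def to_base32(num, min_num_digits):
    digits = '0123456789ABCDEFGHIJKLMNOPQRSTUV'
    out = ''
    while num:
        num, rem = divmod(num, 32)
        out = digits[rem] + out
    return out.rjust(min_num_digits, '0')

print(to_base32(5, 4))    # '0005'       -> e.g. kindle:embed:0005
print(to_base32(70, 10))  # '0000000026' -> a 10 digit link offset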
311 src/calibre/ebooks/mobi/writer8/mobi.py Normal file
@@ -0,0 +1,311 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import time, random
|
||||
from struct import pack
|
||||
|
||||
from calibre.ebooks.mobi.utils import RECORD_SIZE, utf8_text
|
||||
from calibre.ebooks.mobi.writer8.header import Header
|
||||
from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED)
|
||||
from calibre.ebooks.mobi.langcodes import iana2mobi
|
||||
from calibre.ebooks.mobi.writer8.exth import build_exth
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
|
||||
NULL_INDEX = 0xffffffff
|
||||
|
||||
class MOBIHeader(Header): # {{{
|
||||
'''
|
||||
Represents the first record in a MOBI file, contains all the metadata about
|
||||
the file.
|
||||
'''
|
||||
|
||||
DEFINITION = '''
|
||||
# 0: Compression
|
||||
compression = DYN
|
||||
|
||||
# 2: Unused
|
||||
unused1 = zeroes(2)
|
||||
|
||||
# 4: Text length
|
||||
text_length = DYN
|
||||
|
||||
# 8: Last text record
|
||||
last_text_record = DYN
|
||||
|
||||
# 10: Text record size
|
||||
record_size = {record_size}
|
||||
|
||||
# 12: Encryption Type
|
||||
encryption_type
|
||||
|
||||
# 14: Unused
|
||||
unused2
|
||||
|
||||
# 16: Ident
|
||||
ident = b'MOBI'
|
||||
|
||||
# 20: Header length
|
||||
header_length = 248
|
||||
|
||||
# 24: Book Type (0x2 - Book, 0x101 - News hierarchical, 0x102 - News
|
||||
# (flat), 0x103 - News magazine same as 0x101)
|
||||
book_type = DYN
|
||||
|
||||
# 28: Text encoding (utf-8 = 65001)
|
||||
encoding = 65001
|
||||
|
||||
# 32: UID
|
||||
uid = DYN
|
||||
|
||||
# 36: File version
|
||||
file_version = {file_version}
|
||||
|
||||
# 40: Meta orth record (used in dictionaries)
|
||||
meta_orth_record = NULL
|
||||
|
||||
# 44: Meta infl index
|
||||
meta_infl_index = NULL
|
||||
|
||||
# 48: Extra indices
|
||||
extra_index0 = NULL
|
||||
extra_index1 = NULL
|
||||
extra_index2 = NULL
|
||||
extra_index3 = NULL
|
||||
extra_index4 = NULL
|
||||
extra_index5 = NULL
|
||||
extra_index6 = NULL
|
||||
extra_index7 = NULL
|
||||
|
||||
# 80: First non text record
|
||||
first_non_text_record = DYN
|
||||
|
||||
# 84: Title offset
|
||||
title_offset
|
||||
|
||||
# 88: Title Length
|
||||
title_length = DYN
|
||||
|
||||
# 92: Language code
|
||||
language_code = DYN
|
||||
|
||||
# 96: Dictionary in and out languages
|
||||
in_lang
|
||||
out_lang
|
||||
|
||||
# 104: Min version
|
||||
min_version = {file_version}
|
||||
|
||||
# 108: First resource record
|
||||
first_resource_record = DYN
|
||||
|
||||
# 112: Huff/CDIC compression
|
||||
huff_first_record
|
||||
huff_count
|
||||
|
||||
# 120: Unknown (Maybe DATP related, maybe HUFF/CDIC related)
|
||||
maybe_datp = zeroes(8)
|
||||
|
||||
# 128: EXTH flags
|
||||
exth_flags = DYN
|
||||
|
||||
# 132: Unknown
|
||||
unknown = zeroes(36)
|
||||
|
||||
# 168: DRM
|
||||
drm_offset = NULL
|
||||
drm_count
|
||||
drm_size
|
||||
drm_flags
|
||||
|
||||
# 184: Unknown
|
||||
unknown2 = zeroes(8)
|
||||
|
||||
# 192: FDST
|
||||
fdst_record = DYN
|
||||
fdst_count = DYN
|
||||
|
||||
# 200: FCI
|
||||
fcis_record = NULL
|
||||
fcis_count
|
||||
|
||||
# 208: FLIS
|
||||
flis_record = NULL
|
||||
flis_count
|
||||
|
||||
# 216: Unknown
|
||||
unknown3 = zeroes(8)
|
||||
|
||||
# 224: SRCS
|
||||
srcs_record = NULL
|
||||
srcs_count
|
||||
|
||||
# 232: Unknown
|
||||
unknown4 = nulls(8)
|
||||
|
||||
# 240: Extra data flags
|
||||
# 0b1 - extra multibyte bytes after text records
|
||||
# 0b10 - TBS indexing data (only used in MOBI 6)
|
||||
# 0b100 - uncrossable breaks only used in MOBI 6
|
||||
extra_data_flags = DYN
|
||||
|
||||
# 244: KF8 Indices
|
||||
ncx_index = DYN
|
||||
chunk_index = DYN
|
||||
skel_index = DYN
|
||||
datp_index = NULL
|
||||
guide_index = DYN
|
||||
|
||||
# 264: EXTH
|
||||
exth = DYN
|
||||
|
||||
# Full title
|
||||
full_title = DYN
|
||||
|
||||
# Padding to allow amazon's DTP service to add data
|
||||
padding = zeroes(8192)
|
||||
'''
|
||||
|
||||
SHORT_FIELDS = {'compression', 'last_text_record', 'record_size',
|
||||
'encryption_type', 'unused2'}
|
||||
ALIGN = True
|
||||
POSITIONS = {'title_offset':'full_title'}
|
||||
|
||||
def __init__(self, file_version=8):
|
||||
self.DEFINITION = self.DEFINITION.format(file_version=file_version,
|
||||
record_size=RECORD_SIZE)
|
||||
super(MOBIHeader, self).__init__()
|
||||
|
||||
def format_value(self, name, val):
|
||||
if name == 'compression':
|
||||
val = PALMDOC if val else UNCOMPRESSED
|
||||
return super(MOBIHeader, self).format_value(name, val)
|
||||
|
||||
# }}}
|
||||
|
||||
HEADER_FIELDS = {'compression', 'text_length', 'last_text_record', 'book_type',
|
||||
'first_non_text_record', 'title_length', 'language_code',
|
||||
'first_resource_record', 'exth_flags', 'fdst_record',
|
||||
'fdst_count', 'ncx_index', 'chunk_index', 'skel_index',
|
||||
'guide_index', 'exth', 'full_title', 'extra_data_flags',
|
||||
'uid'}
|
||||
|
||||
class KF8Book(object):
|
||||
|
||||
def __init__(self, writer, for_joint=False):
|
||||
self.build_records(writer, for_joint)
|
||||
self.used_images = writer.used_images
|
||||
|
||||
def build_records(self, writer, for_joint):
|
||||
metadata = writer.oeb.metadata
|
||||
# The text records
|
||||
for x in ('last_text_record_idx', 'first_non_text_record_idx'):
|
||||
setattr(self, x.rpartition('_')[0], getattr(writer, x))
|
||||
self.records = writer.records
|
||||
self.text_length = writer.text_length
|
||||
|
||||
# KF8 Indices
|
||||
self.chunk_index = len(self.records)
|
||||
self.records.extend(writer.chunk_records)
|
||||
self.skel_index = len(self.records)
|
||||
self.records.extend(writer.skel_records)
|
||||
self.guide_index = NULL_INDEX
|
||||
if writer.guide_records:
|
||||
self.guide_index = len(self.records)
|
||||
self.records.extend(writer.guide_records)
|
||||
self.ncx_index = NULL_INDEX
|
||||
if writer.ncx_records:
|
||||
self.ncx_index = len(self.records)
|
||||
self.records.extend(writer.ncx_records)
|
||||
|
||||
# Resources
|
||||
resources = writer.resources
|
||||
for x in ('cover_offset', 'thumbnail_offset', 'masthead_offset'):
|
||||
setattr(self, x, getattr(resources, x))
|
||||
|
||||
self.first_resource_record = NULL_INDEX
|
||||
before = len(self.records)
|
||||
if resources.records:
|
||||
self.first_resource_record = len(self.records)
|
||||
if not for_joint:
|
||||
resources.serialize(self.records, writer.used_images)
|
||||
self.num_of_resources = len(self.records) - before
|
||||
|
||||
# FDST
|
||||
self.fdst_count = writer.fdst_count
|
||||
self.fdst_record = len(self.records)
|
||||
self.records.extend(writer.fdst_records)
|
||||
|
||||
# EOF
|
||||
self.records.append(b'\xe9\x8e\r\n') # EOF record
|
||||
|
||||
# Miscellaneous header fields
|
||||
self.compression = writer.compress
|
||||
self.book_type = 0x101 if writer.opts.mobi_periodical else 2
|
||||
self.full_title = utf8_text(unicode(metadata.title[0]))
|
||||
self.title_length = len(self.full_title)
|
||||
self.extra_data_flags = 0b1
|
||||
if writer.has_tbs:
|
||||
self.extra_data_flags |= 0b10
|
||||
self.uid = random.randint(0, 0xffffffff)
|
||||
|
||||
self.language_code = iana2mobi(str(metadata.language[0]))
|
||||
self.exth_flags = 0b1010000
|
||||
if writer.opts.mobi_periodical:
|
||||
self.exth_flags |= 0b1000
|
||||
|
||||
self.opts = writer.opts
|
||||
self.start_offset = writer.start_offset
|
||||
self.metadata = metadata
|
||||
self.kuc = 0 if len(resources.records) > 0 else None
|
||||
|
||||
@property
|
||||
def record0(self):
|
||||
''' We generate the EXTH header and record0 dynamically, to allow other
|
||||
code to customize various values after build_records() has been
|
||||
called'''
|
||||
opts = self.opts
|
||||
self.exth = build_exth(self.metadata,
|
||||
prefer_author_sort=opts.prefer_author_sort,
|
||||
is_periodical=opts.mobi_periodical,
|
||||
share_not_sync=opts.share_not_sync,
|
||||
cover_offset=self.cover_offset,
|
||||
thumbnail_offset=self.thumbnail_offset,
|
||||
num_of_resources=self.num_of_resources,
|
||||
kf8_unknown_count=self.kuc, be_kindlegen2=True,
|
||||
start_offset=self.start_offset, mobi_doctype=self.book_type)
|
||||
|
||||
kwargs = {field:getattr(self, field) for field in HEADER_FIELDS}
|
||||
return MOBIHeader()(**kwargs)
|
||||
|
||||
def write(self, outpath):
|
||||
records = [self.record0] + self.records[1:]
|
||||
|
||||
with open(outpath, 'wb') as f:
|
||||
|
||||
# Write PalmDB Header
|
||||
|
||||
title = ascii_filename(self.full_title.decode('utf-8')).replace(
|
||||
' ', '_')[:31]
|
||||
title += (b'\0' * (32 - len(title)))
|
||||
now = int(time.time())
|
||||
nrecords = len(records)
|
||||
f.write(title)
|
||||
f.write(pack(b'>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0))
|
||||
f.write(b'BOOKMOBI')
|
||||
f.write(pack(b'>IIH', (2*nrecords)-1, 0, nrecords))
|
||||
offset = f.tell() + (8 * nrecords) + 2
|
||||
for i, record in enumerate(records):
|
||||
f.write(pack(b'>I', offset))
|
||||
f.write(b'\0' + pack(b'>I', 2*i)[1:])
|
||||
offset += len(record)
|
||||
f.write(b'\0\0')
|
||||
|
||||
for rec in records:
|
||||
f.write(rec)
|
||||
|
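A standalone sketch (not part of this commit) of the PalmDB container that KF8Book.write() above produces: a 32 byte name, the PDB header fields, a 'BOOKMOBI' type/creator pair, then an 8 byte entry per record giving its file offset and a 3 byte uid, followed by the records themselves. The book name and record contents are fake.

import time
from struct import pack

records = [b'record zero', b'\xe9\x8e\r\n']
name = b'Sample_Book'.ljust(32, b'\0')
now = int(time.time())
nrecords = len(records)

header = name
header += pack(b'>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0)  # attrs, version, dates, ...
header += b'BOOKMOBI'                                     # type/creator
header += pack(b'>IIH', (2*nrecords)-1, 0, nrecords)      # uid seed, next list, count

offset = len(header) + (8 * nrecords) + 2
entries = b''
for i, rec in enumerate(records):
    entries += pack(b'>I', offset) + b'\0' + pack(b'>I', 2*i)[1:]  # offset + 3 byte uid
    offset += len(rec)
pdb = header + entries + b'\0\0' + b''.join(records)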
417 src/calibre/ebooks/mobi/writer8/skeleton.py Normal file
@@ -0,0 +1,417 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
from collections import namedtuple
|
||||
from functools import partial
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre.ebooks.oeb.base import XHTML_NS
|
||||
from calibre.constants import ispy3
|
||||
from calibre.ebooks.mobi.utils import to_base
|
||||
|
||||
CHUNK_SIZE = 8192
|
||||
|
||||
# References in links are stored with 10 digits
|
||||
to_href = partial(to_base, base=32, min_num_digits=10)
|
||||
|
||||
# Tags to which kindlegen adds the aid attribute
|
||||
aid_able_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b',
|
||||
'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details',
|
||||
'dfn', 'div', 'dl', 'dt', 'em', 'fieldset', 'figcaption', 'figure', 'footer',
|
||||
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'i', 'ins', 'kbd',
|
||||
'label', 'legend', 'li', 'map', 'mark', 'meter', 'nav', 'ol', 'output', 'p',
|
||||
'pre', 'progress', 'q', 'rp', 'rt', 'samp', 'section', 'select', 'small',
|
||||
'span', 'strong', 'sub', 'summary', 'sup', 'textarea', 'time', 'ul', 'var',
|
||||
'video'}
|
||||
|
||||
_self_closing_pat = re.compile(bytes(
|
||||
r'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>'%('|'.join(aid_able_tags))),
|
||||
re.IGNORECASE)
|
||||
|
||||
def close_self_closing_tags(raw):
|
||||
return _self_closing_pat.sub(br'<\g<tag>\g<arg>></\g<tag>>', raw)
|
||||
|
||||
def path_to_node(node):
|
||||
ans = []
|
||||
parent = node.getparent()
|
||||
while parent is not None:
|
||||
ans.append(parent.index(node))
|
||||
node = parent
|
||||
parent = parent.getparent()
|
||||
return tuple(reversed(ans))
|
||||
|
||||
def node_from_path(root, path):
|
||||
parent = root
|
||||
for idx in path:
|
||||
parent = parent[idx]
|
||||
return parent
|
||||
|
||||
mychr = chr if ispy3 else unichr
|
||||
|
||||
def tostring(raw, **kwargs):
|
||||
''' lxml *sometimes* represents non-ascii characters as hex entities in
|
||||
attribute values. I can't figure out exactly what circumstances cause it.
|
||||
It seems to happen when serializing a part of a larger tree. Since we need
|
||||
serialization to be the same when serializing full and partial trees, we
|
||||
manually replace all hex entities with their unicode codepoints. '''
|
||||
|
||||
xml_declaration = kwargs.pop('xml_declaration', False)
|
||||
encoding = kwargs.pop('encoding', 'UTF-8')
|
||||
kwargs['encoding'] = unicode
|
||||
kwargs['xml_declaration'] = False
|
||||
ans = etree.tostring(raw, **kwargs)
|
||||
if xml_declaration:
|
||||
ans = '<?xml version="1.0" encoding="%s"?>\n'%encoding + ans
|
||||
return re.sub(r'&#x([0-9A-Fa-f]+);', lambda m:mychr(int(m.group(1), 16)),
|
||||
ans).encode(encoding)
|
||||
|
||||
class Chunk(object):
|
||||
|
||||
def __init__(self, raw, parent_tag):
|
||||
self.raw = raw
|
||||
self.starts_tags = []
|
||||
self.ends_tags = []
|
||||
self.insert_pos = None
|
||||
self.parent_tag = parent_tag
|
||||
self.parent_is_body = False
|
||||
self.is_last_chunk = False
|
||||
self.is_first_chunk = False
|
||||
|
||||
def __len__(self):
|
||||
return len(self.raw)
|
||||
|
||||
def merge(self, chunk):
|
||||
self.raw += chunk.raw
|
||||
self.ends_tags = chunk.ends_tags
|
||||
|
||||
def __repr__(self):
|
||||
return 'Chunk(len=%r insert_pos=%r starts_tags=%r ends_tags=%r)'%(
|
||||
len(self.raw), self.insert_pos, self.starts_tags, self.ends_tags)
|
||||
|
||||
@property
|
||||
def selector(self):
|
||||
typ = 'S' if (self.is_last_chunk and not self.parent_is_body) else 'P'
|
||||
return "%s-//*[@aid='%s']"%(typ, self.parent_tag)
|
||||
|
||||
__str__ = __repr__
|
||||
|
||||
class Skeleton(object):
|
||||
|
||||
def __init__(self, file_number, item, root, chunks):
|
||||
self.file_number, self.item = file_number, item
|
||||
self.chunks = chunks
|
||||
|
||||
self.skeleton = self.render(root)
|
||||
self.body_offset = self.skeleton.find('<body')
|
||||
self.calculate_metrics(root)
|
||||
|
||||
self.calculate_insert_positions()
|
||||
|
||||
def render(self, root):
|
||||
raw = tostring(root, xml_declaration=True)
|
||||
raw = raw.replace(b'<html', bytes('<html xmlns="%s"'%XHTML_NS), 1)
|
||||
return raw
|
||||
|
||||
def calculate_metrics(self, root):
|
||||
Metric = namedtuple('Metric', 'start end')
|
||||
self.metrics = {}
|
||||
for tag in root.xpath('//*[@aid]'):
|
||||
text = (tag.text or '').encode('utf-8')
|
||||
raw = tostring(tag, with_tail=True)
|
||||
start_length = len(raw.partition(b'>')[0]) + len(text) + 1
|
||||
end_length = len(raw.rpartition(b'<')[-1]) + 1
|
||||
self.metrics[tag.get('aid')] = Metric(start_length, end_length)
|
||||
|
||||
def calculate_insert_positions(self):
|
||||
pos = self.body_offset
|
||||
for chunk in self.chunks:
|
||||
for tag in chunk.starts_tags:
|
||||
pos += self.metrics[tag].start
|
||||
chunk.insert_pos = pos
|
||||
pos += len(chunk)
|
||||
for tag in chunk.ends_tags:
|
||||
pos += self.metrics[tag].end
|
||||
|
||||
def rebuild(self):
|
||||
ans = self.skeleton
|
||||
for chunk in self.chunks:
|
||||
i = chunk.insert_pos
|
||||
ans = ans[:i] + chunk.raw + ans[i:]
|
||||
return ans
|
||||
|
||||
def __len__(self):
|
||||
return len(self.skeleton) + sum([len(x.raw) for x in self.chunks])
|
||||
|
||||
@property
|
||||
def raw_text(self):
|
||||
return b''.join([self.skeleton] + [x.raw for x in self.chunks])
|
||||
|
||||
class Chunker(object):
|
||||
|
||||
def __init__(self, oeb, data_func, placeholder_map):
|
||||
self.oeb, self.log = oeb, oeb.log
|
||||
self.data = data_func
|
||||
self.placeholder_map = placeholder_map
|
||||
|
||||
self.skeletons = []
|
||||
|
||||
# Set this to a list to enable dumping of the original and rebuilt
|
||||
# html files for debugging
|
||||
orig_dumps = None
|
||||
|
||||
for i, item in enumerate(self.oeb.spine):
|
||||
root = self.remove_namespaces(self.data(item))
|
||||
body = root.xpath('//body')[0]
|
||||
body.tail = '\n'
|
||||
|
||||
if orig_dumps is not None:
|
||||
orig_dumps.append(tostring(root, xml_declaration=True,
|
||||
with_tail=True))
|
||||
orig_dumps[-1] = close_self_closing_tags(
|
||||
orig_dumps[-1].replace(b'<html',
|
||||
bytes('<html xmlns="%s"'%XHTML_NS), 1))
|
||||
|
||||
# First pass: break up document into rendered strings of length no
|
||||
# more than CHUNK_SIZE
|
||||
chunks = []
|
||||
self.step_into_tag(body, chunks)
|
||||
|
||||
# Second pass: Merge neighboring small chunks within the same
|
||||
# skeleton tag so as to have chunks as close to the CHUNK_SIZE as
|
||||
# possible.
|
||||
chunks = self.merge_small_chunks(chunks)
|
||||
|
||||
# Third pass: Create the skeleton and calculate the insert position
|
||||
# for all chunks
|
||||
self.skeletons.append(Skeleton(i, item, root, chunks))
|
||||
|
||||
if orig_dumps:
|
||||
self.dump(orig_dumps)
|
||||
|
||||
# Create the SKEL and Chunk tables
|
||||
self.skel_table = []
|
||||
self.chunk_table = []
|
||||
self.create_tables()
|
||||
|
||||
# Set internal links
|
||||
text = b''.join(x.raw_text for x in self.skeletons)
|
||||
self.text = self.set_internal_links(text)
|
||||
|
||||
def remove_namespaces(self, root):
|
||||
lang = None
|
||||
for attr, val in root.attrib.iteritems():
|
||||
if attr.rpartition('}')[-1] == 'lang':
|
||||
lang = val
|
||||
|
||||
# Remove all namespace information from the tree. This means namespaced
|
||||
# tags have their namespaces removed and all namespace declarations are
|
||||
# removed. We have to do this manual cloning of the tree as there is no
|
||||
# other way to remove namespace declarations in lxml. This is done so
|
||||
# that serialization creates clean HTML 5 markup with no namespaces. We
|
||||
# insert the XHTML namespace manually after serialization. The
|
||||
# preceding layers should have removed svg and any other non html
|
||||
# namespaced tags.
|
||||
attrib = {'lang':lang} if lang else {}
|
||||
nroot = etree.Element('html', attrib=attrib)
|
||||
nroot.text = root.text
|
||||
nroot.tail = '\n'
|
||||
|
||||
for tag in root.iterdescendants(etree.Element):
|
||||
# We are ignoring all non tag entities in the tree
|
||||
# like comments and processing instructions, as they make the
|
||||
# chunking code even harder, for minimal gain.
|
||||
elem = nroot.makeelement(tag.tag.rpartition('}')[-1],
|
||||
attrib={k.rpartition('}')[-1]:v for k, v in
|
||||
tag.attrib.iteritems()})
|
||||
elem.text, elem.tail = tag.text, tag.tail
|
||||
parent = node_from_path(nroot, path_to_node(tag.getparent()))
|
||||
parent.append(elem)
|
||||
|
||||
return nroot
|
||||
|
||||
def step_into_tag(self, tag, chunks):
|
||||
aid = tag.get('aid')
|
||||
is_body = tag.tag == 'body'
|
||||
|
||||
first_chunk_idx = len(chunks)
|
||||
|
||||
# First handle any text
|
||||
if tag.text and tag.text.strip(): # Leave pure whitespace in the skel
|
||||
chunks.extend(self.chunk_up_text(tag.text, aid))
|
||||
tag.text = None
|
||||
|
||||
# Now loop over children
|
||||
for child in list(tag):
|
||||
raw = tostring(child, with_tail=False)
|
||||
raw = close_self_closing_tags(raw)
|
||||
if len(raw) > CHUNK_SIZE and child.get('aid', None):
|
||||
self.step_into_tag(child, chunks)
|
||||
if child.tail and child.tail.strip(): # Leave pure whitespace
|
||||
chunks.extend(self.chunk_up_text(child.tail, aid))
|
||||
child.tail = None
|
||||
else:
|
||||
if len(raw) > CHUNK_SIZE:
|
||||
self.log.warn('Tag %s has no aid and a too large chunk'
|
||||
' size. Adding anyway.'%child.tag)
|
||||
chunks.append(Chunk(raw, aid))
|
||||
if child.tail:
|
||||
chunks.extend(self.chunk_up_text(child.tail, aid))
|
||||
tag.remove(child)
|
||||
|
||||
if len(chunks) <= first_chunk_idx and chunks:
|
||||
raise ValueError('Stepped into a tag that generated no chunks.')
|
||||
|
||||
# Mark the first and last chunks of this tag
|
||||
if chunks:
|
||||
chunks[first_chunk_idx].starts_tags.append(aid)
|
||||
chunks[-1].ends_tags.append(aid)
|
||||
my_chunks = chunks[first_chunk_idx:]
|
||||
if my_chunks:
|
||||
my_chunks[0].is_first_chunk = True
|
||||
my_chunks[-1].is_last_chunk = True
|
||||
if is_body:
|
||||
for chunk in my_chunks:
|
||||
chunk.parent_is_body = True
|
||||
|
||||
def chunk_up_text(self, text, parent_tag):
|
||||
text = text.encode('utf-8')
|
||||
ans = []
|
||||
|
||||
def split_multibyte_text(raw):
|
||||
if len(raw) <= CHUNK_SIZE:
|
||||
return raw, b''
|
||||
l = raw[:CHUNK_SIZE]
|
||||
l = l.decode('utf-8', 'ignore').encode('utf-8')
|
||||
return l, raw[len(l):]
|
||||
|
||||
start, rest = split_multibyte_text(text)
|
||||
ans.append(start)
|
||||
while rest:
|
||||
start, rest = split_multibyte_text(rest)
|
||||
ans.append(b'<span class="AmznBigTextBlock">' + start + '</span>')
|
||||
return [Chunk(x, parent_tag) for x in ans]
|
||||
|
||||
def merge_small_chunks(self, chunks):
|
||||
ans = chunks[:1]
|
||||
for chunk in chunks[1:]:
|
||||
prev = ans[-1]
|
||||
if (
|
||||
chunk.starts_tags or # Starts a tag in the skel
|
||||
len(chunk) + len(prev) > CHUNK_SIZE or # Too large
|
||||
prev.ends_tags # Prev chunk ended a tag
|
||||
):
|
||||
ans.append(chunk)
|
||||
else:
|
||||
prev.merge(chunk)
|
||||
return ans
|
||||
|
||||
def create_tables(self):
|
||||
Skel = namedtuple('Skel',
|
||||
'file_number name chunk_count start_pos length')
|
||||
sp = 0
|
||||
for s in self.skeletons:
|
||||
s.start_pos = sp
|
||||
sp += len(s)
|
||||
self.skel_table = [Skel(s.file_number, 'SKEL%010d'%s.file_number,
|
||||
len(s.chunks), s.start_pos, len(s.skeleton)) for s in self.skeletons]
|
||||
|
||||
Chunk = namedtuple('Chunk',
|
||||
'insert_pos selector file_number sequence_number start_pos length')
|
||||
num = 0
|
||||
for skel in self.skeletons:
|
||||
cp = 0
|
||||
for chunk in skel.chunks:
|
||||
self.chunk_table.append(
|
||||
Chunk(chunk.insert_pos + skel.start_pos, chunk.selector,
|
||||
skel.file_number, num, cp, len(chunk.raw)))
|
||||
cp += len(chunk.raw)
|
||||
num += 1
|
||||
|
||||
def set_internal_links(self, text):
|
||||
''' Update the internal link placeholders to point to the correct
|
||||
location, based on the chunk table.'''
|
||||
# A kindle:pos:fid link contains two base 32 numbers of the form
|
||||
# XXXX:YYYYYYYYYY
|
||||
# The first number is an index into the chunk table and the second is
|
||||
# an offset from the start of the chunk to the start of the tag pointed
|
||||
# to by the link.
|
||||
aid_map = {} # Map of aid to (pos, fid)
|
||||
for match in re.finditer(br'<[^>]+? aid=[\'"]([A-Z0-9]+)[\'"]', text):
|
||||
offset = match.start()
|
||||
pos_fid = None
|
||||
for chunk in self.chunk_table:
|
||||
if chunk.insert_pos <= offset < chunk.insert_pos + chunk.length:
|
||||
pos_fid = (chunk.sequence_number, offset-chunk.insert_pos)
|
||||
break
|
||||
if chunk.insert_pos > offset:
|
||||
# This aid is in the skeleton, not in a chunk, so we use
|
||||
# the chunk immediately after
|
||||
pos_fid = (chunk.sequence_number, 0)
|
||||
break
|
||||
if pos_fid is None:
|
||||
raise ValueError('Could not find chunk for aid: %r'%
|
||||
match.group(1))
|
||||
aid_map[match.group(1)] = pos_fid
|
||||
|
||||
self.aid_offset_map = aid_map
|
||||
|
||||
def to_placeholder(aid):
|
||||
pos, fid = aid_map[aid]
|
||||
pos, fid = to_base(pos, min_num_digits=4), to_href(fid)
|
||||
return bytes(':off:'.join((pos, fid)))
|
||||
|
||||
placeholder_map = {bytes(k):to_placeholder(v) for k, v in
|
||||
self.placeholder_map.iteritems()}
|
||||
|
||||
# Now update the links
|
||||
def sub(match):
|
||||
raw = match.group()
|
||||
pl = match.group(1)
|
||||
try:
|
||||
return raw[:-19] + placeholder_map[pl]
|
||||
except KeyError:
|
||||
pass
|
||||
return raw
|
||||
|
||||
return re.sub(br'<[^>]+(kindle:pos:fid:0000:off:[0-9A-Za-z]{10})', sub,
|
||||
text)
|
||||
|
||||
def dump(self, orig_dumps):
|
||||
import tempfile, shutil, os
|
||||
tdir = os.path.join(tempfile.gettempdir(), 'skeleton')
|
||||
self.log('Skeletons dumped to:', tdir)
|
||||
if os.path.exists(tdir):
|
||||
shutil.rmtree(tdir)
|
||||
orig = os.path.join(tdir, 'orig')
|
||||
rebuilt = os.path.join(tdir, 'rebuilt')
|
||||
chunks = os.path.join(tdir, 'chunks')
|
||||
for x in (orig, rebuilt, chunks):
|
||||
os.makedirs(x)
|
||||
error = False
|
||||
for i, skeleton in enumerate(self.skeletons):
|
||||
for j, chunk in enumerate(skeleton.chunks):
|
||||
with open(os.path.join(chunks, 'file-%d-chunk-%d.html'%(i, j)),
|
||||
'wb') as f:
|
||||
f.write(chunk.raw)
|
||||
oraw, rraw = orig_dumps[i], skeleton.rebuild()
|
||||
with open(os.path.join(orig, '%04d.html'%i), 'wb') as f:
|
||||
f.write(oraw)
|
||||
with open(os.path.join(rebuilt, '%04d.html'%i), 'wb') as f:
|
||||
f.write(rraw)
|
||||
if oraw != rraw:
|
||||
error = True
|
||||
if error:
|
||||
raise ValueError('The before and after HTML differs. Run a diff '
|
||||
'tool on the orig and rebuilt directories')
|
||||
else:
|
||||
self.log('Skeleton HTML before and after is identical.')
|
||||
|
||||
|
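A toy illustration (not part of this commit) of the placeholder rewrite done by set_internal_links() above: the trailing 19 characters of each kindle:pos:fid placeholder ('0000:off:' plus a 10 digit offset) are replaced by the real chunk sequence number and offset once the chunk table is known. The href, aid and mapped value below are invented.

import re

placeholder_map = {b'kindle:pos:fid:0000:off:0000000003': b'0001:off:000000002K'}

def sub(match):
    raw, pl = match.group(), match.group(1)
    try:
        return raw[:-19] + placeholder_map[pl]
    except KeyError:
        return raw

text = b'<a href="kindle:pos:fid:0000:off:0000000003" aid="7">x</a>'
print(re.sub(br'<[^>]+(kindle:pos:fid:0000:off:[0-9A-Za-z]{10})', sub, text))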
109 src/calibre/ebooks/mobi/writer8/tbs.py Normal file
@@ -0,0 +1,109 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
        print_function)

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from collections import namedtuple
from functools import partial

from calibre.ebooks.mobi.utils import (RECORD_SIZE, encode_trailing_data,
        encode_tbs)

Entry = namedtuple('IndexEntry', 'index start length depth parent '
        'first_child last_child title')
Data = namedtuple('Data', 'starts ends completes spans')

def collect_indexing_data(entries, number_of_text_records):
    ''' For every text record calculate which index entries start, end, span or
    are contained within that record.'''

    data = []
    for i in xrange(number_of_text_records):
        record_start, next_record_start = i*RECORD_SIZE, (i+1)*RECORD_SIZE
        datum = Data([], [], [], [])
        data.append(datum)

        for entry in entries:
            end = entry.start + entry.length - 1
            if (entry.start >= next_record_start or end < record_start):
                # This entry does not have any overlap with this record
                continue
            if (entry.start < record_start and end >= next_record_start):
                # This entry spans this record
                datum.spans.append(entry)
                continue
            if (entry.start >= record_start and end < next_record_start):
                # This entry is contained in this record
                datum.completes.append(entry)
            if (entry.start >= record_start and end >= next_record_start):
                # This entry starts in this record
                datum.starts.append(entry)
                continue
            if (entry.start < record_start and end < next_record_start):
                # This entry ends in this record
                datum.ends.append(entry)

        for x in datum:
            # Should be unnecessary as entries are already in this order, but
            # best to be safe.
            x.sort(key=lambda x:x.depth)

    return data

def generate_tbs_for_flat_index(indexing_data):
    ans = []
    record_type = 8  # 8 for KF8, 0 for MOBI 6
    enc = partial(encode_tbs, flag_size=3)
    for datum in indexing_data:
        tbs = b''
        extra = {0b010: record_type}
        if not (datum.starts or datum.ends or datum.completes or datum.spans):
            # No index entry touches this record
            pass
        elif datum.spans:
            extra[0b001] = 0
            tbs = enc(datum.spans[0].index, extra)
        else:
            starts, ends, completes = datum[:3]
            if (not completes and len(starts) + len(ends) == 1):
                # Either has the first or the last index, and no other indices.
                node = (starts+ends)[0]
                tbs = enc(node.index, extra)
            else:
                # This record contains the end of an index and
                # some complete index entries. Or it contains some complete
                # entries and a start. Or it contains an end, a start and
                # optionally some completes. In every case, we encode the first
                # entry to touch this record and the number of entries
                # that touch this record.
                nodes = starts + completes + ends
                nodes.sort(key=lambda x:x.index)
                extra[0b100] = len(nodes)
                tbs = enc(nodes[0].index, extra)
        ans.append(tbs)

    return ans

def apply_trailing_byte_sequences(index_table, records, number_of_text_records):
    entries = tuple(Entry(r['index'], r['offset'], r['length'], r['depth'],
        r.get('parent', None), r.get('first_child', None), r.get('last_child',
            None), r['label']) for r in index_table)

    indexing_data = collect_indexing_data(entries, number_of_text_records)
    max_depth = max(e['depth'] for e in index_table)
    if max_depth > 0:
        # TODO: Implement for hierarchical ToCs
        tbs = []
    else:
        tbs = generate_tbs_for_flat_index(indexing_data)
    if not tbs:
        return False
    for i, tbs_bytes in enumerate(tbs):
        records[i+1] += encode_trailing_data(tbs_bytes)
    return True
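A worked example (not part of this commit) of the classification performed by collect_indexing_data() above, assuming the usual 4096 byte MOBI text records and two invented index entries: one contained in record 0 and one that starts in record 0, spans record 1 and ends in record 2.

RECORD_SIZE = 4096
entries = [('intro', 0, 3000), ('feature', 3000, 9000)]  # (title, start, length)

for i in range(3):
    rec_start, rec_end = i*RECORD_SIZE, (i+1)*RECORD_SIZE
    for title, start, length in entries:
        end = start + length - 1
        if start >= rec_end or end < rec_start:
            rel = 'no overlap'
        elif start < rec_start and end >= rec_end:
            rel = 'spans'
        elif start >= rec_start and end < rec_end:
            rel = 'completes'
        elif start >= rec_start:
            rel = 'starts'
        else:
            rel = 'ends'
        print('record %d: %s %s' % (i, title, rel))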
Some files were not shown because too many files have changed in this diff.