[Sync] Sync with trunk. Revision 10065.

This commit is contained in:
Li Fanxi 2011-08-04 01:23:08 +08:00
commit 96fa2af7cd
266 changed files with 119899 additions and 70098 deletions

View File

@ -19,6 +19,191 @@
# new recipes:
# - title:
- version: 0.8.12
date: 2011-07-29
new features:
- title: "Content server: Return the correct last modified date when serving ebook files. Also allow getting of book metadata as /get/opf/<book_id>"
- title: "Driver for the COBY MP977"
- title: "Get Books: Remove epub bud store. Add Ozon.ru and e-knigni.net stores. Fix broken amazon UK and DE stores."
tickets: [816091]
- title: "Add a new tweak to Preferences->Tweaks that allows auto generation of series numbers when importing books with a series name, but no number"
tickets: [815573]
bug fixes:
- title: "Fix a regression in 0.8.11 that broke calibre on linux systems that use a file system encoding that cannot support cyrillic characters"
tickets: [815224]
- title: "Fix long titles not wrapping in cover browser"
tickets: [816595]
- title: "When adding books, handle the case of files without read permission more gracefully."
tickets: [814771]
- title: "When changing metadata in EPUB files do not use the opf: namespace prefix on newly created elements. Apparently, FBReaderJ doesn't understand XML namespaces."
tickets: [814722]
- title: "Prevent metadata download from returning published dates earlier than 101 A.D."
- title: "Fix a bug where dates before 101AD in the database could cause errors"
tickets: [814964]
- title: "Fix an error in the book details panel if the user sets the default author link to blank"
improved recipes:
- The Economist
- Instapaper
- Corren
new recipes:
- title: Counterpunch
author: O. Emmerson
- title: National Geographic (PL)
author: Marcin Urban
- title: Caros Amigos
author: Pablo Aldama
- title: Aksiyon Dergisi
author: thomass
- title: Dnevnik (MK) and +Info
author: Darko Spasovski
- title: Dagens Industri
author: Jonas Svensson
- version: 0.8.11
date: 2011-07-22
new features:
- title: "When doing a conversion from some format to the same format, save the original file"
description: "When calibre does a conversion from the same format to the same format, for
example, from EPUB to EPUB, the original file is saved as original_epub, so that in case the
conversion is poor, you can change the settings and run it again. The original is automatically used
every time you run a conversion with that format as input. If you want to disable this,
there is a tweak that prevents calibre from saving the originals in Preferences->Tweaks. You can
easily replace the converted version with the original in the Edit metadata dialog by right
clicking on the list of formats in the top right corner."
type: major
- title: "Conversion pipeline: Add an option to control the height of the blank lines inserted by calibre"
- title: "Drivers for bq DaVinci, Samsung Galaxy ACE GT-S5830 and Medion e-reader"
- title: "Get Books: Add stores Chitanka and Bookoteka. Remove epubbuy.de at store's request"
- title: "Content server: Add a link at the bottom of the mobile interface to switch to the full interface."
tickets: [812525]
- title: "Update the kindle icon shown when a Kindle is connected to use a picture of the Kindle 3"
tickets: [810852]
- title: "MOBI Output: When converting epub documents that have a start element in their guide, use it to mark the starting position at which the MOBI file will be opened."
tickets: [804755]
- title: "News download: Add a default Accept header to all requests"
bug fixes:
- title: "Fix regression that broke loading translations from .po files in the working directory"
- title: "Fix conversion dialog not allowing series numbers larger than 9999"
tickets: [813281]
- title: "Conversion pipeline: When adding/removing entries to the manifest, ignore unparseable URLs instead of erroring out on them"
- title: "SD Card in Azbooka not being detected"
tickets: [812750]
- title: "Conversion pipeline: Strip out large blocks of contiguous space (more than 10000 contiguous blanks) as these slow down the conversion process and are almost always indicative of an error in the input document."
- title: "ebook-convert: Abort if a keyboard interrupt is raised during parsing"
- title: "Regex builder: Show a nicer error message when the user has the file open in another program on windows."
tickets: [811641]
- title: "When converting in the GUI, set all identifiers present in the book's metadata in the output file, if the output format supports them."
improved recipes:
- NBObline
- JBPress
- Instapaper
- Die Zeit
- Wired (UK)
new recipes:
- title: Utrinski Vesnik
author: Darko Spasovski
- title: IDG.se
author: zapt0
- title: Los Andes
author: Darko Miletic
- title: De Luns a Venres
author: Susana Sotelo Docío
- title: "Nikkei News subscription version"
author: Ado Nishimura
- version: 0.8.10
date: 2011-07-15
new features:
- title: "Add a right click menu to the cover browser. It allows you to view a book, edit metadata etc. from within the cover browser. The menu can be customized in Preferences->Toolbars"
- title: "Allow selecting and stopping multiple jobs at once in the jobs window"
tickets: [810349]
- title: "When editing metadata directly in the book list, have a little pop up menu so that all existing values can be accessed by mouse only. For example, when you edit authors, you can use the mouse to select an existing author."
- title: "Get Books: Add ebook.nl and fix price parsing for the legimi store"
- title: "Drivers for Samsung Infuse and Motorola XPERT"
- title: "Tag Browser: Make hierarchical items work in group searched terms."
bug fixes:
- title: "Allow setting numbers larger than 99 in custom series columns"
- title: "Fix a bug that caused the same news download sent via a USB connection to the device on two different days resulting in a duplicate on the device"
- title: "Ensure English in the list of interface languages in Preferences is always listed in English, so that it does not become hard to find"
- title: "SNB Output: Fix bug in handling unicode file names"
- title: "Fix sorting problem in manage categories. Fix poor performance problem when dropping multiple books onto a user category."
- title: "Remove 'empty field' error dialogs in bulk search/replace, instead setting the fields to their default value."
- title: "Fix regression that broke communicating with Kobo devices using outdated firmware"
tickets: [807832]
- title: "LRF Input: Fix conversion of LRF files with non ascii titles on some windows systems"
tickets: [807641]
improved recipes:
- Time
- Freakonomics Blog
- io9
- "Computer Act!ve"
new recipes:
- title: Techcrunch and Pecat
author: Darko Miletic
- title: Vio Mundo, IDG Now and Tojolaco
author: Diniz Bortoletto
- title: Geek and Poke, Automatiseringgids IT
author: DrMerry
- version: 0.8.9
date: 2011-07-08
@ -32,7 +217,7 @@
- title: "Conversion pipeline: Add option to control if duplicate entries are allowed when generating the Table of Contents from links."
tickets: [806095]
- title: "Metadata download: When merging results, if the query to the xisbn service hangs, wait no more than 10 seconds. Also try harder to preserve the month when downlaoding published date. Do not throw away isbnless results if there are some sources that return isbns and some that do not."
- title: "Metadata download: When merging results, if the query to the xisbn service hangs, wait no more than 10 seconds. Also try harder to preserve the month when downloading published date. Do not throw away isbnless results if there are some sources that return isbns and some that do not."
tickets: [798309]
- title: "Get Books: Remove OpenLibrary since it has the same files as archive.org. Allow direct downloading from Project Gutenberg."
@ -617,7 +802,7 @@
- version: 0.8.0
date: 2010-05-06
date: 2011-05-06
new features:
- title: "Go to http://calibre-ebook.com/new-in/eight to see what's new in 0.8.0"

758
imgsrc/random.svg Normal file
View File

@ -0,0 +1,758 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
width="128"
height="128"
id="svg10643"
sodipodi:version="0.32"
inkscape:version="0.46+devel"
sodipodi:docname="pointer.svgz"
inkscape:output_extension="org.inkscape.output.svgz.inkscape"
inkscape:export-filename="/home/pinheiro/pics/oxygen-icons/scalable/actions/small/32x32/pointer.png"
inkscape:export-xdpi="90"
inkscape:export-ydpi="90"
version="1.0">
<defs
id="defs10645">
<inkscape:perspective
sodipodi:type="inkscape:persp3d"
inkscape:vp_x="0 : 12 : 1"
inkscape:vp_y="0 : 1000 : 0"
inkscape:vp_z="24 : 12 : 1"
inkscape:persp3d-origin="12 : 8 : 1"
id="perspective108" />
<linearGradient
id="linearGradient3233">
<stop
style="stop-color:#ffffff;stop-opacity:1;"
offset="0"
id="stop3235" />
<stop
style="stop-color:#ffffff;stop-opacity:0;"
offset="1"
id="stop3237" />
</linearGradient>
<linearGradient
id="linearGradient3866">
<stop
id="stop3868"
offset="0"
style="stop-color:#fff299;stop-opacity:1;" />
<stop
id="stop3870"
offset="1"
style="stop-color:#dcd8bd;stop-opacity:0;" />
</linearGradient>
<linearGradient
id="linearGradient11059">
<stop
style="stop-color:#727272;stop-opacity:1;"
offset="0"
id="stop11061" />
<stop
id="stop11067"
offset="0.5"
style="stop-color:#a6a6a6;stop-opacity:1;" />
<stop
style="stop-color:#cdcdcd;stop-opacity:1;"
offset="0.75"
id="stop11069" />
<stop
style="stop-color:#acacac;stop-opacity:1;"
offset="1"
id="stop11063" />
</linearGradient>
<linearGradient
id="linearGradient10925">
<stop
style="stop-color:#bf0303;stop-opacity:0;"
offset="0"
id="stop10927" />
<stop
id="stop10978"
offset="0.39309064"
style="stop-color:#bf0303;stop-opacity:0;" />
<stop
id="stop10935"
offset="0.46538317"
style="stop-color:#bf0303;stop-opacity:0.49803922;" />
<stop
style="stop-color:#bf0303;stop-opacity:1;"
offset="0.5"
id="stop10976" />
<stop
id="stop10933"
offset="0.5"
style="stop-color:#bf0303;stop-opacity:1;" />
<stop
style="stop-color:#bf0303;stop-opacity:0.49803922;"
offset="0.55339807"
id="stop10937" />
<stop
id="stop10980"
offset="0.60542935"
style="stop-color:#bf0303;stop-opacity:0;" />
<stop
style="stop-color:#bf0303;stop-opacity:0;"
offset="1"
id="stop10929" />
</linearGradient>
<linearGradient
id="linearGradient10901">
<stop
id="stop10903"
offset="0"
style="stop-color:#fff299;stop-opacity:0;" />
<stop
style="stop-color:#fff299;stop-opacity:1;"
offset="0.5"
id="stop10909" />
<stop
id="stop10905"
offset="1"
style="stop-color:#fff299;stop-opacity:0;" />
</linearGradient>
<linearGradient
id="linearGradient10854">
<stop
style="stop-color:#000000;stop-opacity:1;"
offset="0"
id="stop10856" />
<stop
id="stop10862"
offset="0.5"
style="stop-color:#000000;stop-opacity:0;" />
<stop
style="stop-color:#000000;stop-opacity:1;"
offset="1"
id="stop10858" />
</linearGradient>
<linearGradient
id="linearGradient10711">
<stop
style="stop-color:#ffffff;stop-opacity:1;"
offset="0"
id="stop10713" />
<stop
style="stop-color:#ffffff;stop-opacity:0;"
offset="1"
id="stop10715" />
</linearGradient>
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient10711"
id="radialGradient10875"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(0.8967678,0.05935673,-0.05873468,0.8873664,-5.4012494,0.1392525)"
spreadMethod="reflect"
cx="18.708233"
cy="24.759357"
fx="18.708233"
fy="24.759357"
r="13.169441" />
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient10925"
id="radialGradient10931"
cx="9.996233"
cy="23.364098"
fx="7.6629176"
fy="18.295921"
r="8.7188435"
gradientTransform="matrix(3.0577456,1.8802807,-0.9054531,1.4724637,3.4545267,-24.480143)"
gradientUnits="userSpaceOnUse" />
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient10711"
id="radialGradient10968"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(0.8967678,0.05935673,-0.05873468,0.8873664,-5.4012494,0.1392525)"
spreadMethod="reflect"
cx="18.708233"
cy="24.759357"
fx="18.708233"
fy="24.759357"
r="13.169441" />
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient10925"
id="radialGradient10971"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(2.7002217,0.5715519,-0.4374946,2.0668853,-4.8632848,-26.818351)"
cx="9.1802711"
cy="24.942194"
fx="6.0336409"
fy="17.669048"
r="8.7188435" />
<clipPath
clipPathUnits="userSpaceOnUse"
id="clipPath10999">
<path
sodipodi:nodetypes="ccccc"
id="path11001"
d="M 3.6413483,1.9681703 3.779696,17.490509 14.887308,19.785771 21.079035,17.498126 3.6413483,1.9681703 z"
style="fill:#ff80ff;fill-opacity:1;fill-rule:evenodd;stroke:none" />
</clipPath>
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient10925"
id="radialGradient11003"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(2.7002217,0.5715519,-0.4374946,2.0668853,-4.8632848,-26.818351)"
cx="8.2921495"
cy="23.935163"
fx="8.2488832"
fy="19.781427"
r="8.7188435" />
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient10925"
id="radialGradient11030"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(2.7002217,0.5715519,-0.4374946,2.0668853,-4.8632848,-26.818351)"
cx="8.2921495"
cy="23.935163"
fx="8.2488832"
fy="19.781427"
r="8.7188435" />
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient10711"
id="radialGradient11032"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(0.8967678,0.05935673,-0.05873468,0.8873664,-5.4012494,0.1392525)"
spreadMethod="reflect"
cx="18.708233"
cy="24.759357"
fx="18.708233"
fy="24.759357"
r="13.169441" />
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient10925"
id="radialGradient11034"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(2.7002217,0.5715519,-0.4374946,2.0668853,-4.8632848,-26.818351)"
cx="8.2921495"
cy="23.935163"
fx="8.2488832"
fy="19.781427"
r="8.7188435" />
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient10711"
id="radialGradient3294"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(0.7030075,0.6357809,-0.8060735,0.8913044,14.84311,-8.1934483)"
spreadMethod="reflect"
cx="16.993044"
cy="20.648924"
fx="16.993044"
fy="20.648924"
r="13.169441" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient10711"
id="linearGradient3297"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(0.9823337,0,0,0.9823337,0.03300739,0.6182451)"
spreadMethod="pad"
x1="19.879225"
y1="12.061514"
x2="16.034332"
y2="15.552854" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient10711"
id="linearGradient3353"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(0.9823337,0,0,0.9823337,0.03300739,0.6182451)"
spreadMethod="pad"
x1="19.879225"
y1="12.061514"
x2="16.034332"
y2="15.552854" />
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient10711"
id="radialGradient3355"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(0.7030075,0.6357809,-0.8060735,0.8913044,14.84311,-8.1934483)"
spreadMethod="reflect"
cx="16.993044"
cy="20.648924"
fx="16.993044"
fy="20.648924"
r="13.169441" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient10711"
id="linearGradient3362"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(0.9823337,0,0,0.9823337,0.03300739,0.6182451)"
spreadMethod="pad"
x1="19.879225"
y1="12.061514"
x2="16.034332"
y2="15.552854" />
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient10711"
id="radialGradient3364"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(0.8341244,0.2489558,-0.2435026,0.8158514,0.7851109,-0.01382395)"
spreadMethod="reflect"
cx="17.54755"
cy="21.708042"
fx="17.54755"
fy="21.708042"
r="13.169441" />
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient10711"
id="radialGradient3367"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(0.8151394,0.2358626,-0.2306962,0.7972824,0.7011221,-1.0582457)"
spreadMethod="reflect"
cx="17.54755"
cy="21.708042"
fx="17.54755"
fy="21.708042"
r="13.169441" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient10711"
id="linearGradient3370"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(0.9575785,-0.00803118,0.00803118,0.9575785,-0.0268605,-0.4359562)"
spreadMethod="pad"
x1="19.879225"
y1="12.061514"
x2="16.034332"
y2="15.552854" />
<linearGradient
y2="19.626715"
x2="10.711697"
y1="18.63658"
x1="9.7192469"
gradientTransform="matrix(3.547255,-0.03993894,0.03993894,3.547255,-27.397339,-48.790495)"
gradientUnits="userSpaceOnUse"
id="linearGradient3488"
xlink:href="#linearGradient10711"
inkscape:collect="always" />
<radialGradient
r="1.15625"
fy="20.478674"
fx="11.413477"
cy="20.478674"
cx="11.413477"
spreadMethod="pad"
gradientTransform="matrix(1.7083003,-0.01851949,0.01798426,1.6589328,-8.4797796,-13.189665)"
gradientUnits="userSpaceOnUse"
id="radialGradient3486"
xlink:href="#linearGradient3330"
inkscape:collect="always" />
<linearGradient
y2="19.626715"
x2="10.711697"
y1="18.63658"
x1="9.7192469"
gradientTransform="matrix(3.5474799,0,0,3.5474799,-26.927898,-62.356391)"
gradientUnits="userSpaceOnUse"
id="linearGradient3475"
xlink:href="#linearGradient10711"
inkscape:collect="always" />
<radialGradient
r="1.15625"
fy="20.478674"
fx="11.413477"
cy="20.478674"
cx="11.413477"
spreadMethod="pad"
gradientTransform="matrix(1.7083003,-0.01851949,0.01798426,1.6589328,-8.4797796,-13.189665)"
gradientUnits="userSpaceOnUse"
id="radialGradient3473"
xlink:href="#linearGradient3330"
inkscape:collect="always" />
<radialGradient
spreadMethod="reflect"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(0.7809876,0.01449707,-0.0055455,0.2987498,-0.2924169,2.0957246)"
r="11.765625"
fy="10.911069"
fx="1.1416299"
cy="10.911069"
cx="1.1416299"
id="radialGradient3317"
xlink:href="#linearGradient3206"
inkscape:collect="always" />
<linearGradient
y2="26.641653"
x2="16.836901"
y1="6.8943019"
x1="5.6869311"
gradientTransform="translate(0,-7.2094174)"
gradientUnits="userSpaceOnUse"
id="linearGradient3265"
xlink:href="#linearGradient3267"
inkscape:collect="always" />
<linearGradient
y2="17.133453"
x2="16.836901"
y1="-2.6138983"
x1="5.6869311"
gradientTransform="translate(0,2.298783)"
gradientUnits="userSpaceOnUse"
id="linearGradient3261"
xlink:href="#linearGradient3267"
inkscape:collect="always" />
<linearGradient
gradientTransform="translate(0,-4.8361309)"
y2="24.268368"
x2="16.836901"
y1="4.5210156"
x1="5.6869311"
gradientUnits="userSpaceOnUse"
id="linearGradient3257"
xlink:href="#linearGradient3267"
inkscape:collect="always" />
<linearGradient
gradientTransform="translate(0,-2.4628444)"
y2="21.895081"
x2="16.836901"
y1="2.1477292"
x1="5.6869311"
gradientUnits="userSpaceOnUse"
id="linearGradient3249"
xlink:href="#linearGradient3267"
inkscape:collect="always" />
<linearGradient
gradientUnits="userSpaceOnUse"
y2="19.432236"
x2="16.836901"
y1="-0.31511527"
x1="5.6869311"
id="linearGradient3239"
xlink:href="#linearGradient3267"
inkscape:collect="always" />
<linearGradient
y2="19.626715"
x2="10.711697"
y1="18.384007"
x1="9.8687286"
gradientTransform="matrix(3.6334443,0,0,3.6334443,-27.580699,-51.677773)"
gradientUnits="userSpaceOnUse"
id="linearGradient3220"
xlink:href="#linearGradient10711"
inkscape:collect="always" />
<radialGradient
r="1.15625"
fy="20.478674"
fx="11.413477"
cy="20.478674"
cx="11.413477"
spreadMethod="pad"
gradientTransform="matrix(1.7083003,-0.01851949,0.01798426,1.6589328,-8.4797796,-13.189665)"
gradientUnits="userSpaceOnUse"
id="radialGradient3218"
xlink:href="#linearGradient10711"
inkscape:collect="always" />
<linearGradient
id="linearGradient2657">
<stop
id="stop2659"
offset="0"
style="stop-color:#ff80ff;stop-opacity:1;" />
<stop
id="stop2661"
offset="1"
style="stop-color:#ff80ff;stop-opacity:0;" />
</linearGradient>
<linearGradient
id="linearGradient3206">
<stop
style="stop-color:#b1d28f;stop-opacity:1;"
offset="0"
id="stop3208" />
<stop
style="stop-color:#b1d28f;stop-opacity:1;"
offset="1"
id="stop3210" />
</linearGradient>
<linearGradient
id="linearGradient3241">
<stop
id="stop3243"
offset="0"
style="stop-color:#000000;stop-opacity:1;" />
<stop
id="stop3245"
offset="1"
style="stop-color:#debc85;stop-opacity:0" />
</linearGradient>
<linearGradient
id="linearGradient3267">
<stop
style="stop-color:#debc85;stop-opacity:1;"
offset="0"
id="stop3269" />
<stop
style="stop-color:#debc85;stop-opacity:0;"
offset="1"
id="stop3271" />
</linearGradient>
<linearGradient
id="linearGradient3273">
<stop
style="stop-color:#000000;stop-opacity:1;"
offset="0"
id="stop3275" />
<stop
style="stop-color:#debc85;stop-opacity:0"
offset="1"
id="stop3277" />
</linearGradient>
<linearGradient
id="linearGradient3279">
<stop
style="stop-color:#000000;stop-opacity:1;"
offset="0"
id="stop3281" />
<stop
style="stop-color:#debc85;stop-opacity:0"
offset="1"
id="stop3283" />
</linearGradient>
<linearGradient
id="linearGradient3285">
<stop
style="stop-color:#000000;stop-opacity:1;"
offset="0"
id="stop3287" />
<stop
style="stop-color:#debc85;stop-opacity:0"
offset="1"
id="stop3289" />
</linearGradient>
<linearGradient
id="linearGradient3330">
<stop
style="stop-color:#ff80ff;stop-opacity:0;"
offset="0"
id="stop3332" />
<stop
style="stop-color:#666666;stop-opacity:1;"
offset="1"
id="stop3334" />
</linearGradient>
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient10711"
id="radialGradient4021"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(0.9318803,-0.2210697,0.2308678,0.9731826,-3.9252239,2.7241703)"
spreadMethod="pad"
cx="11.074039"
cy="20.428291"
fx="11.074039"
fy="20.428291"
r="1.15625" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient10711"
id="linearGradient4023"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(0.8514941,0.5243642,-0.5243642,0.8514941,24.154135,2.8247022)"
x1="21.461079"
y1="23.349636"
x2="22.96941"
y2="28.038134" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient10711"
id="linearGradient4030"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(0.8514941,0.5243642,-0.5243642,0.8514941,18.007546,-15.657615)"
x1="21.461079"
y1="23.349636"
x2="22.96941"
y2="28.038134" />
<filter
inkscape:collect="always"
x="-0.20028582"
width="1.4005716"
y="-0.11837127"
height="1.2367425"
id="filter3484">
<feGaussianBlur
inkscape:collect="always"
stdDeviation="0.97202214"
id="feGaussianBlur3486" />
</filter>
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient10711"
id="radialGradient3490"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(1.1086176,-0.4093269,0.6608062,1.7897223,-9.2289678,-4.0397151)"
spreadMethod="reflect"
cx="8.8133469"
cy="14.235861"
fx="8.8133469"
fy="14.235861"
r="5.3238101" />
<clipPath
clipPathUnits="userSpaceOnUse"
id="clipPath3496">
<rect
style="opacity:0.62633481;fill:none;stroke:#000000;stroke-width:0.19602102;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none"
id="rect3498"
width="13.277639"
height="22.63365"
x="5.309958"
y="1.2316679"
ry="1.171887" />
</clipPath>
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient10711"
id="radialGradient3508"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(1.1106976,-0.4036489,0.6516398,1.7930801,-9.2127514,-4.7972628)"
spreadMethod="reflect"
cx="8.8133469"
cy="14.235861"
fx="8.8133469"
fy="14.235861"
r="5.3238101" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient3233"
id="linearGradient3240"
x1="9.4485903"
y1="2.761672"
x2="7.6776314"
y2="19.013866"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(4.1741381,0,0,4.1613891,14.977639,14.527008)" />
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient10711"
id="radialGradient3253"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(4.7157796,-1.4428762,2.6160831,8.4980426,-22.699134,-22.277012)"
spreadMethod="reflect"
cx="8.2230186"
cy="14.316785"
fx="8.2230186"
fy="14.316785"
r="5.3238101" />
<filter
inkscape:collect="always"
id="filter3757"
x="-0.14567212"
width="1.2913442"
y="-0.098205952"
height="1.1964119">
<feGaussianBlur
inkscape:collect="always"
stdDeviation="0.79012916"
id="feGaussianBlur3759" />
</filter>
</defs>
<sodipodi:namedview
id="base"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:pageopacity="0.0"
inkscape:pageshadow="2"
inkscape:zoom="2"
inkscape:cx="8.5584572"
inkscape:cy="52.628863"
inkscape:current-layer="layer1"
showgrid="true"
inkscape:grid-bbox="true"
inkscape:document-units="px"
inkscape:window-width="1280"
inkscape:window-height="742"
inkscape:window-x="296"
inkscape:window-y="56"
showguides="true"
inkscape:guide-bbox="true"
width="24px"
height="24px"
inkscape:object-paths="false"
inkscape:object-nodes="true"
inkscape:snap-nodes="false"
inkscape:snap-global="false">
<inkscape:grid
type="xygrid"
id="grid3664"
empspacing="2"
visible="true"
enabled="true"
spacingx="2.6666px"
spacingy="2.6666px" />
<sodipodi:guide
orientation="1,0"
position="10.507812,7.328125"
id="guide3666" />
<sodipodi:guide
orientation="0,1"
position="10.292968,7.5546875"
id="guide3668" />
</sodipodi:namedview>
<metadata
id="metadata10648">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title></dc:title>
</cc:Work>
</rdf:RDF>
</metadata>
<g
id="layer1"
inkscape:label="Layer 1"
inkscape:groupmode="layer">
<path
style="font-size:medium;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-indent:0;text-align:start;text-decoration:none;line-height:normal;letter-spacing:normal;word-spacing:normal;text-transform:none;direction:ltr;block-progression:tb;writing-mode:lr-tb;text-anchor:start;opacity:0.18099551;color:#000000;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2;marker:none;visibility:visible;display:inline;overflow:visible;filter:url(#filter3757);enable-background:accumulate;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
d="m 5,19 c 5.24e-5,0.523584 0.4764155,0.999948 1,1 l 3.59375,0 2.5,2.5625 c 0.272702,0.267764 0.706204,0.357015 1.0625,0.21875 l 1.25,-0.46875 c 0.353635,-0.127466 0.619754,-0.46962 0.65625,-0.84375 l 0.34375,-3.3125 2.40625,-3 c 0.296435,-0.374818 0.26821,-0.967546 -0.0625,-1.3125 L 5.2034921,1.0488435 5,19 z"
id="path3670"
sodipodi:nodetypes="cccccccccccc"
transform="matrix(3.8351065,0,0,3.8305733,20.000787,24.35592)" />
<path
style="fill:#201020;fill-rule:evenodd;stroke:#595959;stroke-width:5.33333349;stroke-linecap:butt;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none"
d="m 40.022468,18.688398 0,66.582224 16.696553,0 L 68.436432,97.122145 73.70512,95.153883 75.247683,80.05609 86.03678,66.636329 40.022468,18.688398 z"
id="path3502"
sodipodi:nodetypes="cccccccc" />
<path
sodipodi:nodetypes="cccccccc"
id="path3504"
d="m 40.022468,18.688397 0,66.58222 16.696554,0 11.717412,11.851511 5.268688,-1.968253 1.54256,-15.09779 L 85.892267,66.67168 40.022468,18.688397 z"
style="fill:#c4c4c4;fill-opacity:1;fill-rule:evenodd;stroke:none" />
<path
style="fill:url(#radialGradient3253);fill-opacity:1;fill-rule:evenodd;stroke:none"
d="m 40.022468,18.688397 0,66.58222 16.696554,0 11.717412,11.851511 5.268688,-1.968253 1.54256,-15.09779 10.56036,-13.22296 -45.785574,-48.144728 z"
id="path3506"
sodipodi:nodetypes="cccccccc" />
<path
style="fill:none;stroke:url(#linearGradient3240);stroke-width:2.667;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none"
d="M 41.38037,83.915714 58.396539,84.11082 69.204513,94.929961 72.193882,93.656427 73.839587,79.399575 84.08764,66.742537 41.448646,22.246495 41.38037,83.915714 z"
id="path2253"
sodipodi:nodetypes="cccccccc" />
</g>
</svg>

After

Width:  |  Height:  |  Size: 25 KiB

View File

@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
class Aksiyon (BasicNewsRecipe):
title = u'Aksiyon Dergisi'
__author__ = u'thomass'
description = 'Haftalık haber dergisi '
oldest_article =13
max_articles_per_feed =100
no_stylesheets = True
#delay = 1
#use_embedded_content = False
encoding = 'utf-8'
publisher = 'Aksiyon'
category = 'news, haberler,TR,gazete'
language = 'tr'
publication_type = 'magazine'
#extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
#keep_only_tags = [dict(name='font', attrs={'class':['newsDetail','agenda2NewsSpot']}),dict(name='span', attrs={'class':['agenda2Title']}),dict(name='div', attrs={'id':['gallery']})]
remove_tags = [dict(name='img', attrs={'src':[ 'http://medya.aksiyon.com.tr/aksiyon/images/logo/logo.bmp','/aksiyon/images/template/green/baslik0.gif','mobile/home.jpg']}) ]
cover_img_url = 'http://www.aksiyon.com.tr/aksiyon/images/aksiyon/top-page/aksiyon_top_r2_c1.jpg'
masthead_url = 'http://aksiyon.com.tr/aksiyon/images/aksiyon/top-page/aksiyon_top_r2_c1.jpg'
remove_empty_feeds= True
remove_attributes = ['width','height']
feeds = [
( u'ANASAYFA', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=0'),
( u'KARAKUTU', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=11'),
( u'EKONOMİ', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=35'),
( u'EKOANALİZ', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=284'),
( u'YAZARLAR', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=17'),
( u'KİTAPLIK', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=13'),
( u'SİNEMA', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=14'),
( u'ARKA PENCERE', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=27'),
( u'DÜNYA', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=32'),
( u'DOSYALAR', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=34'),
( u'KÜLTÜR & SANAT', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=12'),
( u'KAPAK', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=26'),
( u'SPOR', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=38'),
( u'BİLİŞİM - TEKNOLOJİ', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=39'),
( u'3. BOYUT', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=172'),
( u'HAYAT BİLGİSİ', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=283'),
( u'İŞ DÜNYASI', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=283'),
]
def print_version(self, url):
return url.replace('http://www.aksiyon.com.tr/aksiyon/newsDetail_getNewsById.action?load=detay&', 'http://www.aksiyon.com.tr/aksiyon/mobile_detailn.action?')

View File

@ -0,0 +1,17 @@
__copyright__ = '2011, Pablo Aldama <pabloaldama at gmail.com>'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1311839910(BasicNewsRecipe):
title = u'Caros Amigos'
oldest_article = 20
max_articles_per_feed = 100
language = 'pt_BR'
__author__ = 'Pablo Aldama'
feeds = [(u'Caros Amigos', u'http://carosamigos.terra.com.br/index/index.php?format=feed&type=rss')]
keep_only_tags = [dict(name='div', attrs={'class':['blog']})
,dict(name='div', attrs={'class':['blogcontent']})
]
remove_tags = [dict(name='div', attrs={'class':'addtoany'})]

View File

@ -0,0 +1,23 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1312361378(BasicNewsRecipe):
title = u'Carta capital'
__author__ = 'Pablo Aldama'
language = 'pt_BR'
oldest_article = 9
max_articles_per_feed = 100
feeds = [(u'Politica', u'http://www.cartacapital.com.br/category/politica/feed')
,(u'Economia', u'http://www.cartacapital.com.br/category/economia/feed')
,(u'Cultura', u'http://www.cartacapital.com.br/category/cultura/feed')
,(u'Internacional', u'http://www.cartacapital.com.br/category/internacional/feed')
,(u'Saude', u'http://www.cartacapital.com.br/category/saude/feed')
,(u'Sociedade', u'http://www.cartacapital.com.br/category/sociedade/feed')
,(u'Tecnologia', u'http://www.cartacapital.com.br/category/tecnologia/feed')
,(u'Carta na escola', u'http://www.cartacapital.com.br/category/carta-na-escola/feed')
,(u'Carta fundamental', u'http://www.cartacapital.com.br/category/carta-fundamental/feed')
,(u'Carta verde', u'http://www.cartacapital.com.br/category/carta-verde/feed')
]
def print_version(self, url):
return url + '/print'

View File

@ -1,39 +1,34 @@
# -*- coding: utf-8 -*-
__license__ = 'GPLv3'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1255797795(BasicNewsRecipe):
title = u'Corren'
language = 'sv'
class AdvancedUserRecipe1311446032(BasicNewsRecipe):
title = 'Corren'
__author__ = 'Jonas Svensson'
simultaneous_downloads = 1
no_stylesheets = True
oldest_article = 7
description = 'News from Sweden'
publisher = 'Corren'
category = 'news, politics, Sweden'
oldest_article = 2
delay = 1
max_articles_per_feed = 100
remove_attributes = ['onload']
timefmt = ''
no_stylesheets = True
use_embedded_content = False
encoding = 'iso-8859-1'
language = 'sv'
feeds = [
(u'Toppnyheter (alla kategorier)', u'http://www.corren.se/inc/RssHandler.ashx?id=4122151&ripurl=http://www.corren.se/nyheter/'),
(u'Bostad', u'http://www.corren.se/inc/RssHandler.ashx?id=4122174&ripurl=http://www.corren.se/bostad/'),
(u'Ekonomi & Jobb', u'http://www.corren.se/inc/RssHandler.ashx?id=4122176&ripurl=http://www.corren.se/ekonomi/'),
(u'Kultur & Nöje', u'http://www.corren.se/inc/RssHandler.ashx?id=4122192&ripurl=http://www.corren.se/kultur/'),
(u'Mat & dryck', u'http://www.corren.se/inc/RssHandler.ashx?id=4122201&ripurl=http://www.corren.se/mat-dryck/'),
(u'Motor', u'http://www.corren.se/inc/RssHandler.ashx?id=4122203&ripurl=http://www.corren.se/motor/'),
(u'Sport', u'http://www.corren.se/inc/RssHandler.ashx?id=4122206&ripurl=http://www.corren.se/sport/'),
(u'Åsikter', u'http://www.corren.se/inc/RssHandler.ashx?id=4122223&ripurl=http://www.corren.se/asikter/'),
(u'Mjölby', u'http://www.corren.se/inc/RssHandler.ashx?id=4122235&ripurl=http://www.corren.se/ostergotland/mjolby/'),
(u'Motala', u'http://www.corren.se/inc/RssHandler.ashx?id=4122236&ripurl=http://www.corren.se/ostergotland/motala/')
(u'Toppnyheter', u'http://www.corren.se/inc/RssHandler.ashx?id=4122151&ripurl=http://www.corren.se/nyheter/')
,(u'Ekonomi', u'http://www.corren.se/inc/RssHandler.ashx?id=4122176&ripurl=http://www.corren.se/ekonomi/')
,(u'Link\xf6ping', u'http://www.corren.se/inc/RssHandler.ashx?id=4122234')
,(u'Åsikter', u'http://www.corren.se/inc/RssHandler.ashx?id=4122223,4122224,4122226,4122227,4122228,4122229,4122230')
]
def print_version(self, url):
url = url.replace("ekonomi/artikel.aspx", "Print.aspx")
url = url.replace("bostad/artikel.aspx", "Print.aspx")
url = url.replace("kultur/artikel.aspx", "Print.aspx")
url = url.replace("motor/artikel.aspx", "Print.aspx")
url = url.replace("mat-dryck/artikel.aspx", "Print.aspx")
url = url.replace("sport/artikel.aspx", "Print.aspx")
url = url.replace("asikter/artikel.aspx", "Print.aspx")
url = url.replace("mat-dryck/artikel.aspx", "Print.aspx")
url = url.replace("ostergotland/mjolby/artikel.aspx", "Print.aspx")
url = url.replace("ostergotland/motala/artikel.aspx", "Print.aspx")
return url.replace("nyheter/artikel.aspx", "Print.aspx")
keep_only_tags = [dict(name='div', attrs={'id':'article'}),dict(name='div', attrs={'class':'body'})]
remove_tags = [
dict(name='ul',attrs={'class':'functions'})
,dict(name='a',attrs={'href':'javascript*'})
,dict(name='div',attrs={'class':'box'})
,dict(name='div',attrs={'class':'functionsbottom'})
]

View File

@ -0,0 +1,40 @@
import re
from lxml.html import parse
from calibre.web.feeds.news import BasicNewsRecipe
class Counterpunch(BasicNewsRecipe):
'''
Parses counterpunch.com for articles
'''
title = 'Counterpunch'
description = 'Daily political opinion from www.Counterpunch.com'
language = 'en'
__author__ = 'O. Emmerson'
keep_only_tags = [dict(name='td', attrs={'width': '522'})]
max_articles_per_feed = 10
def parse_index(self):
feeds = []
title, url = 'Counterpunch', 'http://www.counterpunch.com'
articles = self.parse_page(url)
if articles:
feeds.append((title, articles))
return feeds
def parse_page(self, url):
parsed_page = parse(url).getroot()
articles = []
unwanted_text = re.compile('Website\ of\ the|I\ urge\ you|Subscribe\ now|DONATE|\@asis\.com|donation\ button|click\ over\ to\ our')
parsed_articles = [a for a in parsed_page.cssselect("html>body>table tr>td>p[class='style2']") if not unwanted_text.search(a.text_content())]
for art in parsed_articles:
try:
author = art.text
title = art.cssselect("a")[0].text + ' by {0}'.format(author)
art_url = 'http://www.counterpunch.com/' + art.cssselect("a")[0].attrib['href']
articles.append({'title': title, 'url': art_url})
except Exception as e:
e
#print('Handler Error: ', e, 'title :', a.text_content())
pass
return articles

View File

@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
__license__ = 'GPLv3'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1311450855(BasicNewsRecipe):
title = u'Dagens Industri'
__author__ = 'Jonas Svensson'
description = 'Economy news from Sweden'
publisher = 'DI'
category = 'news, politics, Sweden'
oldest_article = 2
delay = 1
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
language = 'sv'
feeds = [(u'DI', u'http://di.se/rss')]
keep_only_tags = [dict(name='h1', attrs={'id':'ctl00_ExtraWideContentRegion_WideContentRegion_MainRegion_MainContentRegion_MainBodyRegion_headlineNormal'}),dict(name='div', attrs={'id':'articleBody'})]
remove_tags = [
dict(name='div',attrs={'class':'article-actions clear'})
,dict(name='div',attrs={'class':'article-action-popup'})
,dict(name='div',attrs={'class':'header'})
,dict(name='div',attrs={'class':'content clear'})
,dict(name='div',attrs={'id':'articleAdvertisementDiv'})
,dict(name='ul',attrs={'class':'action-list'})
]

98
recipes/dnevnik_mk.recipe Normal file
View File

@ -0,0 +1,98 @@
#!/usr/bin/env python
__author__ = 'Darko Spasovski'
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Spasovski <darko.spasovski at gmail.com>'
'''
dnevnik.com.mk
'''
import re
import datetime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class Dnevnik(BasicNewsRecipe):
INDEX = 'http://www.dnevnik.com.mk'
__author__ = 'Darko Spasovski'
title = 'Dnevnik - mk'
description = 'Daily Macedonian newspaper'
masthead_url = 'http://www.dnevnik.com.mk/images/re-logo.gif'
language = 'mk'
publication_type = 'newspaper'
category = 'news, Macedonia'
max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True
use_embedded_content = False
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
## Remove anything before the start of the article.
(r'<body.*?<\?xml version=\"1.0\"\?><!--Article start-->', lambda match: '<body>'),
## Remove anything after the end of the article.
(r'<!--Article end.*?</body>', lambda match : '</body>'),
]
]
extra_css = """
body{font-family: Arial,Helvetica,sans-serif}
.WB_DNEVNIK_Naslov{FONT-WEIGHT: bold; FONT-SIZE: 18px; FONT-FAMILY: Arial, Verdana, Tahoma; TEXT-DECORATION: none}
"""
conversion_options = {
'comment' : description,
'tags' : category,
'language' : language,
'linearize_tables' : True
}
def parse_index(self):
datum = datetime.datetime.today().strftime('%d.%m.%Y')
soup = self.index_to_soup(self.INDEX + '/default.asp?section=arhiva&arhDatum=' + datum)
feeds = []
for section in soup.findAll('td', attrs={'class':'WB_DNEVNIK_ArhivaFormTitle'}):
sectionTitle = section.contents[0].string
if sectionTitle.lower().startswith('online'):
# Skip online articles
continue
containerTable = section.findPrevious(name='table').findNextSibling(name='table')
if containerTable==None:
print 'No container table found - page layout may have been changed.'
continue
articles = []
for article in containerTable.findAll('a', attrs={'class': 'WB_DNEVNIK_ArhivaFormText'}):
title = self.tag_to_string(article, use_alt=True).strip()
articles.append({'title': title, 'url':'http://www.dnevnik.com.mk/' + article['href'], 'description':'', 'date':''})
if articles:
feeds.append((sectionTitle, articles))
return sorted(feeds, key=lambda section: self.get_weight(section))
def get_weight(self, section):
"""
Returns 'weight' of a section.
Used for sorting the sections based on their 'natural' order in the printed edition.
"""
natural_order = { u'во фокусот': 1, u'актуелно': 2, u'економија': 3,
u'отворена': 4, u'свет': 5, u'интервју': 6, u'џубокс': 7,
u'репортажа': 8, u'наш туризам': 9, u'живот': 10,
u'автомобилизам': 11, u'спорт': 12, u'омнибус': 13 }
if section[0].string.lower() in natural_order:
return natural_order[section[0].string.lower()]
else:
return 999 # section names not on the list go to the bottom
def get_cover_url(self):
datum = datetime.datetime.today().strftime('%d.%m.%Y')
soup = self.index_to_soup(self.INDEX + '/default.asp?section=arhiva&arhDatum=' + datum)
anchor = soup.find('a', attrs={'class': 'WB_DNEVNIK_MoreLink'})
if anchor != None:
raw = browser().open_novisit(self.INDEX + '/' + anchor['href']).read()
cover_soup = BeautifulSoup(raw)
url = cover_soup.find('div', attrs={'class':'WB_DNEVNIK_Datum2'}).findNext('img')['src']
return self.INDEX + '/' + url
return ''

View File

@ -6,10 +6,10 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
economist.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
from collections import OrderedDict
import string, time, re
import time, re
class Economist(BasicNewsRecipe):
@ -22,10 +22,12 @@ class Economist(BasicNewsRecipe):
' perspective. Best downloaded on Friday mornings (GMT)')
extra_css = '.headline {font-size: x-large;} \n h2 { font-size: small; } \n h1 { font-size: medium; }'
oldest_article = 7.0
cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
cover_url = 'http://media.economist.com/sites/default/files/imagecache/print-cover-thumbnail/print-covers/currentcoverus_large.jpg'
#cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
remove_tags = [
dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
dict(attrs={'class':['dblClkTrk', 'ec-article-info', 'share_inline_header']}),
dict(attrs={'class':['dblClkTrk', 'ec-article-info',
'share_inline_header', 'related-items']}),
{'class': lambda x: x and 'share-links-header' in x},
]
keep_only_tags = [dict(id='ec-article-body')]
@ -67,52 +69,54 @@ class Economist(BasicNewsRecipe):
return self.economist_parse_index()
def economist_parse_index(self):
soup = BeautifulSoup(self.browser.open(self.INDEX).read(),
convertEntities=BeautifulSoup.HTML_ENTITIES)
index_started = False
feeds = {}
ans = []
key = None
for tag in soup.findAll(['h1', 'h2']):
text = ''.join(tag.findAll(text=True))
if tag.name in ('h1', 'h2') and 'Classified ads' in text:
break
if tag.name == 'h1':
if 'The world this week' in text or 'The world this year' in text:
index_started = True
if not index_started:
soup = self.index_to_soup(self.INDEX)
feeds = OrderedDict()
for section in soup.findAll(attrs={'class':lambda x: x and 'section' in
x}):
h4 = section.find('h4')
if h4 is None:
continue
text = string.capwords(text)
if text not in feeds.keys():
feeds[text] = []
if text not in ans:
ans.append(text)
key = text
section_title = self.tag_to_string(h4).strip()
if not section_title:
continue
if key is None:
self.log('Found section: %s'%section_title)
articles = []
for h5 in section.findAll('h5'):
article_title = self.tag_to_string(h5).strip()
if not article_title:
continue
a = tag.find('a', href=True)
data = h5.findNextSibling(attrs={'class':'article'})
if data is None: continue
a = data.find('a', href=True)
if a is None: continue
url = a['href']
if url.startswith('/'): url = 'http://www.economist.com'+url
url += '/print'
article_title += ': %s'%self.tag_to_string(a).strip()
articles.append({'title':article_title, 'url':url,
'description':'', 'date':''})
if not articles:
# We have last or first section
for art in section.findAll(attrs={'class':'article'}):
a = art.find('a', href=True)
if a is not None:
url = a['href']
id_ = re.search(r'story_id=(\d+)', url).group(1)
url = 'http://www.economist.com/node/%s/print'%id_
if url.startswith('Printer'):
url = '/'+url
if url.startswith('/'):
url = 'http://www.economist.com' + url
try:
subtitle = tag.previousSibling.contents[0].contents[0]
text = subtitle + ': ' + text
except:
pass
article = dict(title=text,
url = url,
description='', content='', date='')
feeds[key].append(article)
if url.startswith('/'): url = 'http://www.economist.com'+url
url += '/print'
title = self.tag_to_string(a)
if title:
articles.append({'title':title, 'url':url,
'description':'', 'date':''})
ans = [(key, feeds[key]) for key in ans if feeds.has_key(key)]
if articles:
feeds[section_title] = articles
ans = [(key, val) for key, val in feeds.iteritems()]
if not ans:
raise Exception('Could not find any articles. Has your subscription expired?')
raise Exception('Could not find any articles, either the '
'economist.com server is having trouble and you should '
'try later or the website format has changed and the '
'recipe needs to be updated.')
return ans
def eco_find_image_tables(self, soup):

View File

@ -16,11 +16,12 @@ class Economist(BasicNewsRecipe):
' Much slower than the print edition based version.')
extra_css = '.headline {font-size: x-large;} \n h2 { font-size: small; } \n h1 { font-size: medium; }'
oldest_article = 7.0
cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
cover_url = 'http://media.economist.com/sites/default/files/imagecache/print-cover-thumbnail/print-covers/currentcoverus_large.jpg'
#cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
remove_tags = [
dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
dict(attrs={'class':['dblClkTrk', 'ec-article-info',
'share_inline_header']}),
'share_inline_header', 'related-items']}),
{'class': lambda x: x and 'share-links-header' in x},
]
keep_only_tags = [dict(id='ec-article-body')]

View File

@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1311790237(BasicNewsRecipe):
title = u'Periódico El Colombiano'
language = 'es_CO'
__author__ = 'BIGO-CAVA'
cover_url = 'http://www.elcolombiano.com/images/logoElColombiano348x46.gif'
remove_tags_before = dict(id='contenidoArt')
remove_tags_after = dict(id='enviaTips')
remove_tags_after = dict(id='zonaPata')
oldest_article = 1
max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True
use_embedded_content = False
remove_empty_feeds = True
masthead_url = 'http://www.elcolombiano.com/images/logoElColombiano348x46.gif'
publication_type = 'newspaper'
extra_css = """
p{text-align: justify; font-size: 100%}
body{ text-align: left; font-size:100% }
h1{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; }
h3{font-family: sans-serif; font-size:100%; font-style: italic; text-align: justify; }
"""
feeds = [(u'Portada', u'http://www.elcolombiano.com/rss/portada.xml'),
(u'Antioquia', u'http://www.elcolombiano.com/rss/Antioquia.xml'),
(u'Colombia', u'http://www.elcolombiano.com/rss/Colombia.xml'),
(u'Economia', u'http://www.elcolombiano.com/rss/Economia.xml'),
(u'Internacional', u'http://www.elcolombiano.com/rss/Internacional.xml'),
(u'Politica', u'http://www.elcolombiano.com/rss/Politica.xml'),
(u'Cultura', u'http://www.elcolombiano.com/rss/Cultura.xml'),
(u'Entretenimiento', u'http://www.elcolombiano.com/rss/Farandula.xml'),
(u'Tecnologia', u'http://www.elcolombiano.com/rss/Tecnologia.xml'),
(u'Television', u'http://www.elcolombiano.com/rss/Television.xml'),
(u'Vida y Sociedad', u'http://www.elcolombiano.com/rss/Vida.xml'),
(u'Turismo', u'http://www.elcolombiano.com/rss/Turismo.xm'),
(u'Salud', u'http://www.elcolombiano.com/rss/Salud.xml'),
(u'Ciencia', u'http://www.elcolombiano.com/rss/Ciencia.xml')]
remove_tags = [dict(name='div', attrs={'class':'objetosRelacionados'}),
dict(name='div', attrs={'class':'notasRelacionadas contenedor'}),
dict(name='div', attrs={'class':'comentarios'}),
dict(name='div', attrs={'class':'mapaDelSitio'}),
dict(name='div', attrs={'class':'creditos'}),
dict(name='div', attrs={'class':'votos'}),
dict(name='div', attrs={'class':'divopt2'}),
dict(name='div', attrs={'class':'comentarios'}),
dict(name='div', attrs={'class':'pestanasLateral'}),
dict(name='div', attrs={'class':'resumenSeccion'}),
dict(name='div', attrs={'class':'zonaComercial'}),
dict(name='div', attrs={'id':'zonaPata'})]

53
recipes/el_tiempo.recipe Normal file
View File

@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
class ColombiaElTiempo02(BasicNewsRecipe):
title = u'Periódico el Tiempo'
language = 'es_CO'
__author__ = 'BIGO-CAVA'
cover_url = 'http://www.eltiempo.com/media/css/images/logo_footer.png'
remove_tags_before = dict(id='fb-root')
remove_tags_after = [dict(name='div', attrs={'class':'modulo reporte'})]
keep_only_tags = [dict(name='div', id='contenidoArt')]
remove_tags = [dict(name='div', attrs={'class':'social-media'}),
dict(name='div', attrs={'class':'caja-facebook'}),
dict(name='div', attrs={'class':'caja-twitter'}),
dict(name='div', attrs={'class':'caja-buzz'}),
dict(name='div', attrs={'class':'ico-mail2'}),
dict(name='div', attrs={'id':'caja-instapaper'}),
dict(name='div', attrs={'class':'modulo herramientas'})]
oldest_article = 2
max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True
use_embedded_content = False
remove_empty_feeds = True
masthead_url = 'http://www.eltiempo.com/media/css/images/logo_footer.png'
publication_type = 'newspaper'
extra_css = """
p{text-align: justify; font-size: 100%}
body{ text-align: left; font-size:100% }
h1{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; }
h3{font-family: sans-serif; font-size:100%; font-style: italic; text-align: justify; }
"""
feeds = [(u'Colombia', u'http://www.eltiempo.com/colombia/rss.xml'),
(u'Medellin', u'http://www.eltiempo.com/colombia/medellin/rss.xml'),
(u'Economia', u'http://www.eltiempo.com/economia/rss.xml'),
(u'Deportes', u'http://www.eltiempo.com/deportes/rss.xml'),
(u'Mundo', u'http://www.eltiempo.com/mundo/rss.xml'),
(u'Gente', u'http://www.eltiempo.com/gente/rss.xml'),
(u'Vida de Hoy', u'http://www.eltiempo.com/vida-de-hoy/rss.xml'),
(u'EEUU', u'http://www.eltiempo.com/mundo/estados-unidos/rss.xml'),
(u'LatinoAmerica', u'http://www.eltiempo.com/mundo/latinoamerica/rss.xml'),
(u'Europa', u'http://www.eltiempo.com/mundo/europa/rss.xml'),
(u'Medio Oriente', u'http://www.eltiempo.com/mundo/medio-oriente/rss.xml'),
(u'Vive in Medellin', u'http://medellin.vive.in/medellin/rss.xml'),
(u'Don Juan', u'http://www.revistadonjuan.com/feedrss/'),
(u'Alo', u'http://www.eltiempo.com/alo/rss.xml')]

View File

@ -1,25 +1,29 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal kovid@kovidgoyal.net'
__copyright__ = '2011, Starson17'
__docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe
class Freakonomics(BasicNewsRecipe):
title = 'Freakonomics Blog'
description = 'The Hidden side of everything'
__author__ = 'Starson17'
__version__ = '1.02'
__date__ = '11 July 2011'
language = 'en'
cover_url = 'http://ilkerugur.files.wordpress.com/2009/04/freakonomics.jpg'
use_embedded_content= False
no_stylesheets = True
oldest_article = 30
remove_javascript = True
remove_empty_feeds = True
max_articles_per_feed = 50
feeds = [('Blog', 'http://feeds.feedburner.com/freakonomicsblog')]
keep_only_tags = [dict(name='div', attrs={'id':'header'}),
dict(name='h1'),
dict(name='h2'),
dict(name='div', attrs={'class':'entry-content'}),
]
feeds = [(u'Freakonomics Blog', u'http://www.freakonomics.com/feed/')]
keep_only_tags = [dict(name='div', attrs={'id':['content']})]
remove_tags_after = [dict(name='div', attrs={'class':['simple_socialmedia']})]
remove_tags = [dict(name='div', attrs={'class':['simple_socialmedia','single-fb-share','wp-polls']})]
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}

View File

@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds import Feed
@ -46,4 +45,3 @@ class GC_gl(BasicNewsRecipe):
}
newArticles.append(newArt)
masterFeed.append((feed.title,newArticles))

View File

@ -12,7 +12,7 @@ from datetime import date
class Guardian(BasicNewsRecipe):
title = u'The Guardian / The Observer'
title = u'The Guardian and The Observer'
if date.today().weekday() == 6:
base_url = "http://www.guardian.co.uk/theobserver"
else:

BIN
recipes/icons/losandes.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 285 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 894 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 119 B

33
recipes/idg_se.recipe Normal file
View File

@ -0,0 +1,33 @@
__license__ = 'GPLv3'
from calibre.web.feeds.news import BasicNewsRecipe
class IDGse(BasicNewsRecipe):
title = 'IDG'
description = 'IDG.se'
language = 'se'
__author__ = 'zapt0'
oldest_article = 1
max_articles_per_feed = 40
no_stylesheets = True
encoding = 'ISO-8859-1'
remove_javascript = True
feeds = [(u'Senaste nytt',u'http://feeds.idg.se/idg/vzzs')]
def print_version(self,url):
return url + '?articleRenderMode=print&m=print'
def get_cover_url(this):
return 'http://idgmedia.idg.se/polopoly_fs/2.3275!images/idgmedia_logo_75.jpg'
keep_only_tags = [
dict(name='h1'),
dict(name='div', attrs={'class':['divColumn1Article']}),
]
#remove ads
remove_tags = [
dict(name='div', attrs={'id':['preamble_ad']}),
dict(name='ul', attrs={'class':['share']})
]

View File

@ -1,4 +1,3 @@
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1299694372(BasicNewsRecipe):
@ -9,14 +8,24 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe):
oldest_article = 365
max_articles_per_feed = 100
no_stylesheets = True
remove_javascript = True
remove_tags = [
dict(name='div', attrs={'id':'text_controls_toggle'})
,dict(name='script')
,dict(name='div', attrs={'id':'text_controls'})
,dict(name='div', attrs={'id':'editing_controls'})
,dict(name='div', attrs={'class':'bar bottom'})
]
use_embedded_content = False
needs_subscription = True
INDEX = u'http://www.instapaper.com'
LOGIN = INDEX + u'/user/login'
feeds = [(u'Instapaper Unread', u'http://www.instapaper.com/u'), (u'Instapaper Starred', u'http://www.instapaper.com/starred')]
feeds = [
(u'Instapaper Unread', u'http://www.instapaper.com/u'),
(u'Instapaper Starred', u'http://www.instapaper.com/starred')
]
def get_browser(self):
br = BasicNewsRecipe.get_browser()
@ -34,21 +43,28 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe):
lfeeds = self.get_feeds()
for feedobj in lfeeds:
feedtitle, feedurl = feedobj
self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
self.report_progress(0, 'Fetching feed'+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
for item in soup.findAll('div', attrs={'class':'titleRow'}):
description = self.tag_to_string(item.div)
for item in soup.findAll('div', attrs={'class':'cornerControls'}):
#description = self.tag_to_string(item.div)
atag = item.a
if atag and atag.has_key('href'):
url = atag['href']
title = self.tag_to_string(atag)
date = strftime(self.timefmt)
articles.append({
'title' :title
,'date' :date
,'url' :url
,'description':description
'url' :url
})
totalfeeds.append((feedtitle, articles))
return totalfeeds
def print_version(self, url):
return 'http://www.instapaper.com' + url
def populate_article_metadata(self, article, soup, first):
article.title = soup.find('title').contents[0].strip()
def postprocess_html(self, soup, first_fetch):
for link_tag in soup.findAll(attrs={"id" : "story"}):
link_tag.insert(0,'<h1>'+soup.find('title').contents[0].strip()+'</h1>')
return soup

View File

@ -16,16 +16,14 @@ class i09(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
use_embedded_content = True
language = 'en'
masthead_url = 'http://cache.gawkerassets.com/assets/io9.com/img/logo.png'
extra_css = '''
body{font-family: "Lucida Grande",Helvetica,Arial,sans-serif}
img{margin-bottom: 1em}
h1{font-family :Arial,Helvetica,sans-serif; font-size:large}
h2{font-family :Arial,Helvetica,sans-serif; font-size:x-small}
'''
conversion_options = {
'comment' : description
, 'tags' : category
@ -33,13 +31,11 @@ class i09(BasicNewsRecipe):
, 'language' : language
}
remove_attributes = ['width','height']
keep_only_tags = [dict(attrs={'class':'content permalink'})]
remove_tags_before = dict(name='h1')
remove_tags = [dict(attrs={'class':'contactinfo'})]
remove_tags_after = dict(attrs={'class':'contactinfo'})
feeds = [(u'Articles', u'http://feeds.gawker.com/io9/vip?format=xml')]
feeds = [(u'Articles', u'http://feeds.gawker.com/io9/full')]
remove_tags = [
{'class': 'feedflare'},
]
def preprocess_html(self, soup):
return self.adeify_images(soup)

View File

@ -18,6 +18,7 @@ class IrishTimes(BasicNewsRecipe):
oldest_article = 1.0
max_articles_per_feed = 100
no_stylesheets = True
simultaneous_downloads= 5
r = re.compile('.*(?P<url>http:\/\/(www.irishtimes.com)|(rss.feedsportal.com\/c)\/.*\.html?).*')
remove_tags = [dict(name='div', attrs={'class':'footer'})]
@ -25,17 +26,17 @@ class IrishTimes(BasicNewsRecipe):
feeds = [
('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'),
('Ireland', 'http://rss.feedsportal.com/c/851/f/10845/index.rss'),
('World', 'http://rss.feedsportal.com/c/851/f/10846/index.rss'),
('Finance', 'http://rss.feedsportal.com/c/851/f/10847/index.rss'),
('Features', 'http://rss.feedsportal.com/c/851/f/10848/index.rss'),
('Sport', 'http://rss.feedsportal.com/c/851/f/10849/index.rss'),
('Opinion', 'http://rss.feedsportal.com/c/851/f/10850/index.rss'),
('Letters', 'http://rss.feedsportal.com/c/851/f/10851/index.rss'),
('Ireland', 'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'),
('World', 'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'),
('Finance', 'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'),
('Features', 'http://www.irishtimes.com/feeds/rss/newspaper/features.rss'),
('Sport', 'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'),
('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'),
('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
('Magazine', 'http://www.irishtimes.com/feeds/rss/newspaper/magazine.rss'),
('Health', 'http://rss.feedsportal.com/c/851/f/10852/index.rss'),
('Education & Parenting', 'http://rss.feedsportal.com/c/851/f/10853/index.rss'),
('Motors', 'http://rss.feedsportal.com/c/851/f/10854/index.rss'),
('Health', 'http://www.irishtimes.com/feeds/rss/newspaper/health.rss'),
('Education & Parenting', 'http://www.irishtimes.com/feeds/rss/newspaper/education.rss'),
('Motors', 'http://www.irishtimes.com/feeds/rss/newspaper/motors.rss'),
('An Teanga Bheo', 'http://www.irishtimes.com/feeds/rss/newspaper/anteangabheo.rss'),
('Commercial Property', 'http://www.irishtimes.com/feeds/rss/newspaper/commercialproperty.rss'),
('Science Today', 'http://www.irishtimes.com/feeds/rss/newspaper/sciencetoday.rss'),
@ -49,10 +50,16 @@ class IrishTimes(BasicNewsRecipe):
def print_version(self, url):
if url.count('rss.feedsportal.com'):
u = url.replace('0Bhtml/story01.htm','_pf0Bhtml/story01.htm')
#u = url.replace('0Bhtml/story01.htm','_pf0Bhtml/story01.htm')
u = url.find('irishtimes')
u = 'http://www.irishtimes.com' + url[u + 12:]
u = u.replace('0C', '/')
u = u.replace('A', '')
u = u.replace('0Bhtml/story01.htm', '_pf.html')
else:
u = url.replace('.html','_pf.html')
return u
def get_article_url(self, article):
return article.link

View File

@ -1,4 +1,4 @@
import urllib2
import urllib2, re
from calibre.web.feeds.news import BasicNewsRecipe
class JBPress(BasicNewsRecipe):
@ -40,3 +40,12 @@ class JBPress(BasicNewsRecipe):
def print_version(self, url):
url = urllib2.urlopen(url).geturl() # resolve redirect.
return url.replace('/-/', '/print/')
def preprocess_html(self, soup):
# remove breadcrumb
h3s = soup.findAll('h3')
for h3 in h3s:
if re.compile('^JBpress&gt;').match(h3.string):
h3.extract()
return soup

78
recipes/losandes.recipe Normal file
View File

@ -0,0 +1,78 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
www.losandes.com.ar
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class LosAndes(BasicNewsRecipe):
title = 'Los Andes'
__author__ = 'Darko Miletic'
description = 'Noticias de Mendoza, Argentina y el resto del mundo'
publisher = 'Los Andes'
category = 'news, politics, Argentina'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'cp1252'
use_embedded_content = False
language = 'es_AR'
remove_empty_feeds = True
publication_type = 'newspaper'
masthead_url = 'http://www.losandes.com.ar/graficos/losandes.png'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif }
h1,h2{font-family: "Times New Roman",Times,serif}
.fechaNota{font-weight: bold; color: gray}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_tags = [
dict(name=['meta','link'])
,dict(attrs={'class':['cabecera', 'url']})
]
remove_tags_before=dict(attrs={'class':'cabecera'})
remove_tags_after=dict(attrs={'class':'url'})
feeds = [
(u'Ultimas Noticias' , u'http://www.losandes.com.ar/servicios/rss.asp?r=78' )
,(u'Politica' , u'http://www.losandes.com.ar/servicios/rss.asp?r=68' )
,(u'Economia nacional' , u'http://www.losandes.com.ar/servicios/rss.asp?r=65' )
,(u'Economia internacional' , u'http://www.losandes.com.ar/servicios/rss.asp?r=505')
,(u'Internacionales' , u'http://www.losandes.com.ar/servicios/rss.asp?r=66' )
,(u'Turismo' , u'http://www.losandes.com.ar/servicios/rss.asp?r=502')
,(u'Fincas' , u'http://www.losandes.com.ar/servicios/rss.asp?r=504')
,(u'Isha nos habla' , u'http://www.losandes.com.ar/servicios/rss.asp?r=562')
,(u'Estilo' , u'http://www.losandes.com.ar/servicios/rss.asp?r=81' )
,(u'Cultura' , u'http://www.losandes.com.ar/servicios/rss.asp?r=503')
,(u'Policiales' , u'http://www.losandes.com.ar/servicios/rss.asp?r=70' )
,(u'Deportes' , u'http://www.losandes.com.ar/servicios/rss.asp?r=69' )
,(u'Sociedad' , u'http://www.losandes.com.ar/servicios/rss.asp?r=67' )
,(u'Opinion' , u'http://www.losandes.com.ar/servicios/rss.asp?r=80' )
,(u'Editorial' , u'http://www.losandes.com.ar/servicios/rss.asp?r=76' )
,(u'Mirador' , u'http://www.losandes.com.ar/servicios/rss.asp?r=79' )
]
def print_version(self, url):
artid = url.rpartition('.')[0].rpartition('-')[2]
return "http://www.losandes.com.ar/includes/modulos/imprimir.asp?tipo=noticia&id=" + artid
def get_cover_url(self):
month = strftime("%m").lstrip('0')
day = strftime("%d").lstrip('0')
year = strftime("%Y")
return "http://www.losandes.com.ar/fotografias/fotosnoticias/" + year + "/" + month + "/" + day + "/th_tapa.jpg"
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup

View File

@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
class LV_gl(BasicNewsRecipe):
title = u'De Luns a Venres (RSS)'
__author__ = u'Susana Sotelo Docío'
description = u'O gratuíto galego'
publisher = u'Galiciaé'
category = u'news'
encoding = 'utf-8'
language = 'gl'
direction = 'ltr'
cover_url = 'http://lv.galiciae.com/new_estilos/lv/logo.gif'
oldest_article = 2
max_articles_per_feed = 200
center_navbar = False
feeds = [
(u'Galicia', u'http://lv.galiciae.com/cache/rss/sec_galicia_gl.rss'),
(u'Cultura', u'http://lv.galiciae.com/cache/rss/sec_cultura_gl.rss'),
(u'Mundo', u'http://lv.galiciae.com/cache/rss/sec_mundo_gl.rss'),
(u'Cidadanía', u'http://lv.galiciae.com/cache/rss/sec_ciudadania_gl.rss'),
(u'Tecnoloxía', u'http://lv.galiciae.com/cache/rss/sec_tecnologia_gl.rss'),
(u'España', u'http://lv.galiciae.com/cache/rss/sec_espana_gl.rss'),
(u'Deportes', u'http://lv.galiciae.com/cache/rss/sec_deportes_gl.rss'),
(u'Economía', u'http://lv.galiciae.com/cache/rss/sec_economia_gl.rss'),
(u'Lercheo', u'http://lv.galiciae.com/cache/rss/sec_gente_gl.rss'),
(u'Medio ambiente', u'http://lv.galiciae.com/cache/rss/sec_medioambiente_gl.rss'),
(u'España/Mundo', u'http://lv.galiciae.com/cache/rss/sec_espanamundo_gl.rss'),
(u'Sociedade', u'http://lv.galiciae.com/cache/rss/sec_sociedad_gl.rss'),
(u'Ciencia', u'http://lv.galiciae.com/cache/rss/sec_ciencia_gl.rss'),
(u'Motor', u'http://lv.galiciae.com/cache/rss/sec_motor_gl.rss'),
(u'Coches', u'http://lv.galiciae.com/cache/rss/sec_coches_gl.rss'),
(u'Motos', u'http://lv.galiciae.com/cache/rss/sec_motos_gl.rss'),
(u'Industriais', u'http://lv.galiciae.com/cache/rss/sec_industriales_gl.rss')
]
extra_css = u' p{text-align:left} '
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\nencoding="' + encoding + '"\ntags="' + category + '"\noverride_css=" p {text-align:left; text-indent: 0cm} "'
def print_version(self, url):
url += '?imprimir&lang=gl'
return url

View File

@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = 'Marcin Urban 2011'
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class recipeMagic(BasicNewsRecipe):
title = 'National Geographic PL'
__author__ = 'Marcin Urban 2011'
description = 'legenda wśród magazynów z historią sięgającą 120 lat'
cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
#delay = 1
use_embedded_content = False
encoding = 'utf8'
publisher = 'G+J Gruner+Jahr Polska'
category = 'news, PL,'
language = 'pl'
publication_type = 'newsportal'
extra_css = ''' body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
h1{text-align: center;}
h2{font-size: medium; font-weight: bold;}
.authordate {font-size: small; color: #696969;}
p.lead {font-weight: bold; text-align: center;}
.fot{font-size: x-small; color: #666666;} '''
preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
,'linearize_tables': True
}
remove_tags = [
dict(name='div', attrs={'class':'add_inf'}),
dict(name='div', attrs={'class':'add_f'}),
]
remove_attributes = ['width','height']
feeds = [
('National Geographic PL', 'http://www.national-geographic.pl/rss/'),
]
def print_version(self, url):
return url.replace('artykuly0Cpokaz', 'drukuj-artykul')

View File

@ -1,11 +1,10 @@
EMAILADDRESS = 'hoge@foobar.co.jp'
from calibre.web.feeds.news import BasicNewsRecipe
class NBOnline(BasicNewsRecipe):
title = u'Nikkei Business Online'
language = 'ja'
description = u'Nikkei Business Online New articles. PLEASE NOTE: You need to edit EMAILADDRESS line of this "nbonline.recipe" file to set your e-mail address which is needed when login. (file is in "Calibre2/resources/recipes" directory.)'
description = u'Nikkei Business Online.\u6CE8\uFF1A\u30E6\u30FC\u30B6\u30FC\u540D\u306Bemail\u30A2\u30C9\u30EC\u30B9\u3068\u30E6\u30FC\u30B6\u30FC\u540D\u3092\u30BB\u30DF\u30B3\u30ED\u30F3\u3067\u533A\u5207\u3063\u3066\u5165\u308C\u3066\u304F\u3060\u3055\u3044\u3002\u4F8B\uFF1Aemail@address.jp;username . PLEASE NOTE: You need to put your email address and username into username filed separeted by ; (semi-colon).'
__author__ = 'Ado Nishimura'
needs_subscription = True
oldest_article = 7
@ -23,8 +22,8 @@ class NBOnline(BasicNewsRecipe):
if self.username is not None and self.password is not None:
br.open('https://signon.nikkeibp.co.jp/front/login/?ct=p&ts=nbo')
br.select_form(name='loginActionForm')
br['email'] = EMAILADDRESS
br['userId'] = self.username
br['email'] = self.username.split(';')[0]
br['userId'] = self.username.split(';')[1]
br['password'] = self.password
br.submit()
return br

View File

@ -0,0 +1,88 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
#import pprint, sys
#pp = pprint.PrettyPrinter(indent=4)
class NikkeiNet_paper_subscription(BasicNewsRecipe):
title = u'\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\uFF08\u671D\u520A\u30FB\u5915\u520A\uFF09'
__author__ = 'Ado Nishimura'
description = u'\u65E5\u7D4C\u96FB\u5B50\u7248\u306B\u3088\u308B\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\u3002\u671D\u520A\u30FB\u5915\u520A\u306F\u53D6\u5F97\u6642\u9593\u306B\u3088\u308A\u5207\u308A\u66FF\u308F\u308A\u307E\u3059\u3002\u8981\u8CFC\u8AAD'
needs_subscription = True
oldest_article = 1
max_articles_per_feed = 30
language = 'ja'
no_stylesheets = True
cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
remove_tags_before = {'class':"cmn-indent"}
remove_tags = [
# {'class':"cmn-article_move"},
# {'class':"cmn-pr_list"},
# {'class':"cmnc-zoom"},
{'class':"cmn-hide"},
{'name':'form'},
]
remove_tags_after = {'class':"cmn-indent"}
def get_browser(self):
br = BasicNewsRecipe.get_browser()
#pp.pprint(self.parse_index())
#exit(1)
#br.set_debug_http(True)
#br.set_debug_redirects(True)
#br.set_debug_responses(True)
if self.username is not None and self.password is not None:
print "----------------------------open top page----------------------------------------"
br.open('http://www.nikkei.com/')
print "----------------------------open first login form--------------------------------"
link = br.links(url_regex="www.nikkei.com/etc/accounts/login").next()
br.follow_link(link)
#response = br.response()
#print response.get_data()
print "----------------------------JS redirect(send autoPostForm)-----------------------"
br.select_form(name='autoPostForm')
br.submit()
#response = br.response()
print "----------------------------got login form---------------------------------------"
br.select_form(name='LA0210Form01')
br['LA0210Form01:LA0210Email'] = self.username
br['LA0210Form01:LA0210Password'] = self.password
br.submit()
#response = br.response()
print "----------------------------JS redirect------------------------------------------"
br.select_form(nr=0)
br.submit()
#br.set_debug_http(False)
#br.set_debug_redirects(False)
#br.set_debug_responses(False)
return br
def cleanup(self):
print "----------------------------logout-----------------------------------------------"
self.browser.open('https://regist.nikkei.com/ds/etc/accounts/logout')
def parse_index(self):
print "----------------------------get index of paper-----------------------------------"
result = []
soup = self.index_to_soup('http://www.nikkei.com/paper/')
#soup = self.index_to_soup(self.test_data())
for sect in soup.findAll('div', 'cmn-section kn-special JSID_baseSection'):
sect_title = sect.find('h3', 'cmnc-title').string
sect_result = []
for elem in sect.findAll(attrs={'class':['cmn-article_title']}):
url = 'http://www.nikkei.com' + elem.span.a['href']
url = re.sub("/article/", "/print-article/", url) # print version.
span = elem.span.a.span
if ((span is not None) and (len(span.contents) > 1)):
title = span.contents[1].string
sect_result.append(dict(title=title, url=url, date='',
description='', content=''))
result.append([sect_title, sect_result])
#pp.pprint(result)

47
recipes/plus_info.recipe Normal file
View File

@ -0,0 +1,47 @@
#!/usr/bin/env python
__author__ = 'Darko Spasovski'
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Spasovski <darko.spasovski at gmail.com>'
'''
www.plusinfo.mk
'''
from calibre.web.feeds.news import BasicNewsRecipe
class PlusInfo(BasicNewsRecipe):
INDEX = 'www.plusinfo.mk'
title = u'+info'
__author__ = 'Darko Spasovski'
description = 'Macedonian news portal'
publication_type = 'newsportal'
category = 'news, Macedonia'
language = 'mk'
masthead_url = 'http://www.plusinfo.mk/style/images/logo.jpg'
remove_javascript = True
no_stylesheets = True
use_embedded_content = False
remove_empty_feeds = True
oldest_article = 1
max_articles_per_feed = 100
keep_only_tags = [dict(name='div', attrs={'class': 'vest'})]
remove_tags = [dict(name='div', attrs={'class':['komentari_holder', 'objava']})]
feeds = [(u'Македонија', u'http://www.plusinfo.mk/rss/makedonija'),
(u'Бизнис', u'http://www.plusinfo.mk/rss/biznis'),
(u'Скопје', u'http://www.plusinfo.mk/rss/skopje'),
(u'Култура', u'http://www.plusinfo.mk/rss/kultura'),
(u'Свет', u'http://www.plusinfo.mk/rss/svet'),
(u'Сцена', u'http://www.plusinfo.mk/rss/scena'),
(u'Здравје', u'http://www.plusinfo.mk/rss/zdravje'),
(u'Магазин', u'http://www.plusinfo.mk/rss/magazin'),
(u'Спорт', u'http://www.plusinfo.mk/rss/sport')]
# uncomment the following block if you want the print version (note: it lacks photos)
# def print_version(self,url):
# segments = url.split('/')
# printURL = '/'.join(segments[0:3]) + '/print/' + '/'.join(segments[5:])
# return printURL

36
recipes/portafolio.recipe Normal file
View File

@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1311799898(BasicNewsRecipe):
title = u'Periódico Portafolio Colombia'
language = 'es_CO'
__author__ = 'BIGO-CAVA'
cover_url = 'http://www.portafolio.co/sites/portafolio.co/themes/portafolio_2011/logo.png'
remove_tags_before = dict(id='contenidoArt')
remove_tags_after = [dict(name='div', attrs={'class':'articulo-mas'})]
keep_only_tags = [dict(name='div', id='contenidoArt')]
oldest_article = 1
max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True
use_embedded_content = False
remove_empty_feeds = True
masthead_url = 'http://www.portafolio.co/sites/portafolio.co/themes/portafolio_2011/logo.png'
publication_type = 'newspaper'
extra_css = """
p{text-align: justify; font-size: 100%}
body{ text-align: left; font-size:100% }
h1{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; }
h3{font-family: sans-serif; font-size:100%; font-style: italic; text-align: justify; }
"""
feeds = [(u'Negocios', u'http://www.portafolio.co/negocios/feed'),
(u'Economia', u'http://www.portafolio.co/economia/feed'),
(u'Internacional', u'http://www.portafolio.co/internacional/feed'),
(u'Indicadores', u'http://www.portafolio.co/indicadores/feed'),
(u'Opinion', u'http://www.portafolio.co/opinion/feed'),
(u'Finanzas Personales', u'http://www.portafolio.co/finanzas-personales/feed'),
(u'Herramientas', u'http://www.portafolio.co/herramientas/feed')]

63
recipes/techcrunch.recipe Normal file
View File

@ -0,0 +1,63 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
techcrunch.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class TechCrunch(BasicNewsRecipe):
title = 'TechCrunch'
__author__ = 'Darko Miletic'
description = 'IT News'
publisher = 'AOL Inc.'
category = 'news, IT'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'en'
remove_empty_feeds = True
publication_type = 'newsportal'
masthead_url = 'http://s2.wp.com/wp-content/themes/vip/tctechcrunch2/images/site-logo.png'
extra_css = """
body{font-family: Helvetica,Arial,sans-serif }
img{margin-bottom: 0.4em; display:block}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_tags = [dict(name=['meta','link'])]
remove_attributes=['lang']
keep_only_tags=[
dict(name='h1', attrs={'class':'headline'})
,dict(attrs={'class':['author','post-time','body-copy']})
]
feeds = [(u'News', u'http://feeds.feedburner.com/TechCrunch/')]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('a'):
limg = item.find('img')
if item.string is not None:
str = item.string
item.replaceWith(str)
else:
if limg:
item.name = 'div'
item.attrs = []
else:
str = self.tag_to_string(item)
item.replaceWith(str)
for item in soup.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
return soup

24
recipes/tijolaco.recipe Normal file
View File

@ -0,0 +1,24 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
class Tijolaco(BasicNewsRecipe):
title = u'Tijolaco.com'
__author__ = u'Diniz Bortolotto'
description = u'Posts do Blog Tijola\xe7o.com'
oldest_article = 7
max_articles_per_feed = 50
encoding = 'utf8'
publisher = u'Brizola Neto'
category = 'politics, Brazil'
language = 'pt_BR'
publication_type = 'politics portal'
use_embedded_content = False
no_stylesheets = True
remove_javascript = True
feeds = [(u'Blog Tijola\xe7o.com', u'http://feeds.feedburner.com/Tijolacoblog')]
reverse_article_order = True
keep_only_tags = [dict(name='div', attrs={'class':'post'})]
remove_tags = [dict(name='span', attrs={'class':'com'})]

View File

@ -8,47 +8,33 @@ time.com
import re
from calibre.web.feeds.news import BasicNewsRecipe
from lxml import html
class Time(BasicNewsRecipe):
#recipe_disabled = ('This recipe has been disabled as TIME no longer'
# ' publish complete articles on the web.')
title = u'Time'
__author__ = 'Kovid Goyal and Sujata Raman'
__author__ = 'Kovid Goyal'
description = 'Weekly magazine'
encoding = 'utf-8'
no_stylesheets = True
language = 'en'
remove_javascript = True
extra_css = ''' h1 {font-family:georgia,serif;color:#000000;}
.mainHd{font-family:georgia,serif;color:#000000;}
h2 {font-family:Arial,Sans-serif;}
.name{font-family:Arial,Sans-serif; font-size:x-small;font-weight:bold; }
.date{font-family:Arial,Sans-serif; font-size:x-small ;color:#999999;}
.byline{font-family:Arial,Sans-serif; font-size:x-small ;}
.photoBkt{ font-size:x-small ;}
.vertPhoto{font-size:x-small ;}
.credits{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
.credit{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
.artTxt{font-family:georgia,serif;}
#content{font-family:georgia,serif;}
.caption{font-family:georgia,serif; font-size:x-small;color:#333333;}
.credit{font-family:georgia,serif; font-size:x-small;color:#999999;}
a:link{color:#CC0000;}
.breadcrumb{font-family:Arial,Sans-serif;font-size:x-small;}
'''
keep_only_tags = [ dict(name ="div",attrs = {"id" :["content"]}) ,
dict(name ="div",attrs = {"class" :["artHd","artTxt","photoBkt","vertPhoto","image","copy"]}) ,]
remove_tags = [ dict(name ="div",attrs = {'class':['articleFooterNav','listsByTopic','articleTools2','relatedContent','sideContent','topBannerWrap','articlePagination','nextUp',"rtCol","pagination","enlarge","contentTools2",]}),
dict(name ="span",attrs = {'class':['see']}),
dict(name ="div",attrs = {'id':['header','articleSideBar',"articleTools","articleFooter","cmBotLt","quigoPackage"]}),
dict(name ="a",attrs = {'class':['listLink']}),
dict(name ="ul",attrs = {'id':['shareSocial','tabs']}),
dict(name ="li",attrs = {'class':['back']}),
dict(name ="ul",attrs = {'class':['navCount']}),
keep_only_tags = [
{
'class':['artHd', 'articleContent',
'entry-title','entry-meta', 'entry-content', 'thumbnail']
},
]
remove_tags = [
{'class':['content-tools', 'quigo', 'see',
'first-tier-social-tools', 'navigation', 'enlarge lightbox']},
{'id':['share-tools']},
{'rel':'lightbox'},
]
recursions = 10
match_regexps = [r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}.html',r'http://www.time.com/time/specials/packages/article/.*']
@ -56,10 +42,11 @@ class Time(BasicNewsRecipe):
r'<meta .+/>'), lambda m:'')]
def parse_index(self):
soup = self.index_to_soup('http://www.time.com/time/magazine')
img = soup.find('a', title="View Large Cover", href=True)
if img is not None:
cover_url = 'http://www.time.com'+img['href']
raw = self.index_to_soup('http://www.time.com/time/magazine', raw=True)
root = html.fromstring(raw)
img = root.xpath('//a[.="View Large Cover" and @href]')
if img:
cover_url = 'http://www.time.com' + img[0].get('href')
try:
nsoup = self.index_to_soup(cover_url)
img = nsoup.find('img', src=re.compile('archive/covers'))
@ -70,46 +57,48 @@ class Time(BasicNewsRecipe):
feeds = []
parent = soup.find(id='tocGuts')
for seched in parent.findAll(attrs={'class':'toc_seched'}):
section = self.tag_to_string(seched).capitalize()
articles = list(self.find_articles(seched))
parent = root.xpath('//div[@class="content-main-aside"]')[0]
for sec in parent.xpath(
'descendant::section[contains(@class, "sec-mag-section")]'):
h3 = sec.xpath('./h3')
if h3:
section = html.tostring(h3[0], encoding=unicode,
method='text').strip().capitalize()
self.log('Found section', section)
articles = list(self.find_articles(sec))
if articles:
feeds.append((section, articles))
return feeds
def find_articles(self, seched):
for a in seched.findNextSiblings( attrs={'class':['toc_hed','rule2']}):
if a.name in "div":
break
else:
def find_articles(self, sec):
for article in sec.xpath('./article'):
h2 = article.xpath('./*[@class="entry-title"]')
if not h2: continue
a = h2[0].xpath('./a[@href]')
if not a: continue
title = html.tostring(a[0], encoding=unicode,
method='text').strip()
if not title: continue
url = a[0].get('href')
if url.startswith('/'):
url = 'http://www.time.com'+url
desc = ''
p = article.xpath('./*[@class="entry-content"]')
if p:
desc = html.tostring(p[0], encoding=unicode,
method='text')
self.log('\t', title, ':\n\t\t', desc)
yield {
'title' : self.tag_to_string(a),
'url' : 'http://www.time.com'+a['href'],
'title' : title,
'url' : url,
'date' : '',
'description' : self.article_description(a)
'description' : desc
}
def article_description(self, a):
ans = []
while True:
t = a.nextSibling
if t is None:
break
a = t
if getattr(t, 'name', False):
if t.get('class', '') == 'toc_parens' or t.name == 'br':
continue
if t.name in ('div', 'a'):
break
ans.append(self.tag_to_string(t))
else:
ans.append(unicode(t))
return u' '.join(ans).replace(u'\xa0', u'').strip()
def postprocess_html(self,soup,first):
for tag in soup.findAll(attrs ={'class':['artPag','pagination']}):
tag.extract()
return soup

View File

@ -64,7 +64,7 @@ class UnitedDaily(BasicNewsRecipe):
__author__ = 'Eddie Lau'
__version__ = '1.1'
language = 'zh-TW'
language = 'zh_TW'
publisher = 'United Daily News Group'
description = 'United Daily (Taiwan)'
category = 'News, Chinese, Taiwan'

71
recipes/utrinski.recipe Normal file
View File

@ -0,0 +1,71 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Spasovski <darko.spasovski at gmail.com>'
'''
utrinski.com.mk
'''
import re
import datetime
from calibre.web.feeds.news import BasicNewsRecipe
class UtrinskiVesnik(BasicNewsRecipe):
__author__ = 'Darko Spasovski'
INDEX = 'http://www.utrinski.com.mk/'
title = 'Utrinski Vesnik'
description = 'Daily Macedonian newspaper'
masthead_url = 'http://www.utrinski.com.mk/images/LogoTop.jpg'
language = 'mk'
remove_javascript = True
publication_type = 'newspaper'
category = 'news, Macedonia'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
## Remove anything before the start of the article.
(r'<body.*?Article start-->', lambda match: '<body>'),
## Remove anything after the end of the article.
(r'<!--Article end.*?</body>', lambda match : '</body>'),
]
]
extra_css = """
body{font-family: Arial,Helvetica,sans-serif}
.WB_UTRINSKIVESNIK_Naslov{FONT-WEIGHT: bold; FONT-SIZE: 18px; FONT-FAMILY: Arial, Verdana, Tahoma; TEXT-DECORATION: none}
"""
conversion_options = {
'comment' : description,
'tags' : category,
'language' : language,
'linearize_tables' : True
}
def parse_index(self):
soup = self.index_to_soup(self.INDEX)
feeds = []
for section in soup.findAll('a', attrs={'class':'WB_UTRINSKIVESNIK_TOCTitleBig'}):
sectionTitle = section.contents[0].string
tocItemTable = section.findAllPrevious('table')[1]
if tocItemTable is None: continue
articles = []
while True:
tocItemTable = tocItemTable.nextSibling
if tocItemTable is None: break
article = tocItemTable.findAll('a', attrs={'class': 'WB_UTRINSKIVESNIK_TocItem'})
if len(article)==0: break
title = self.tag_to_string(article[0], use_alt=True).strip()
articles.append({'title': title, 'url':'http://www.utrinski.com.mk/' + article[0]['href'], 'description':'', 'date':''})
if articles:
feeds.append((sectionTitle, articles))
return feeds
def get_cover_url(self):
datum = datetime.datetime.today().strftime('%d_%m_%Y')
return 'http://www.utrinski.com.mk/WBStorage/Files/' + datum + '.jpg'

30
recipes/vio_mundo.recipe Normal file
View File

@ -0,0 +1,30 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class VioMundo(BasicNewsRecipe):
title = 'Blog VioMundo'
__author__ = 'Diniz Bortolotto'
description = 'Posts do Blog VioMundo'
publisher = 'Luiz Carlos Azenha'
oldest_article = 5
max_articles_per_feed = 20
category = 'news, politics, Brazil'
language = 'pt_BR'
publication_type = 'news and politics portal'
use_embedded_content = False
no_stylesheets = True
remove_javascript = True
feeds = [(u'Blog VioMundo', u'http://www.viomundo.com.br/feed')]
reverse_article_order = True
def print_version(self, url):
return url + '/print/'
remove_tags_after = dict(id='BlogContent')
preprocess_regexps = [
(re.compile(r'\|\ <u>.*</p>'),
lambda match: '</p>')
]

View File

@ -1,28 +1,29 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2011, Starson17 <Starson17 at gmail.com>'
'''
www.wired.co.uk
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import re
class Wired_UK(BasicNewsRecipe):
title = 'Wired Magazine - UK edition'
__author__ = 'Darko Miletic'
__author__ = 'Starson17'
__version__ = 'v1.30'
__date__ = '15 July 2011'
description = 'Gaming news'
publisher = 'Conde Nast Digital'
category = 'news, games, IT, gadgets'
oldest_article = 32
oldest_article = 40
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
masthead_url = 'http://www.wired.co.uk/_/media/wired-logo_UK.gif'
#masthead_url = 'http://www.wired.co.uk/_/media/wired-logo_UK.gif'
language = 'en_GB'
extra_css = ' body{font-family: Palatino,"Palatino Linotype","Times New Roman",Times,serif} img{margin-bottom: 0.8em } .img-descr{font-family: Tahoma,Arial,Helvetica,sans-serif; font-size: 0.6875em; display: block} '
index = 'http://www.wired.co.uk/wired-magazine.aspx'
index = 'http://www.wired.co.uk'
conversion_options = {
'comment' : description
@ -31,26 +32,25 @@ class Wired_UK(BasicNewsRecipe):
, 'language' : language
}
keep_only_tags = [dict(name='div', attrs={'class':'article-box'})]
remove_tags = [
dict(name=['object','embed','iframe','link'])
keep_only_tags = [dict(name='div', attrs={'class':['layoutColumn1']})]
remove_tags = [dict(name='div',attrs={'class':['articleSidebar1','commentAddBox linkit','commentCountBox commentCountBoxBig']})]
remove_tags_after = dict(name='div',attrs={'class':['mainCopy entry-content','mainCopy']})
'''
remove_attributes = ['height','width']
,dict(name=['object','embed','iframe','link'])
,dict(attrs={'class':['opts','comment','stories']})
]
remove_tags_after = dict(name='div',attrs={'class':'stories'})
remove_attributes = ['height','width']
'''
def parse_index(self):
totalfeeds = []
soup = self.index_to_soup(self.index)
maincontent = soup.find('div',attrs={'class':'main-content'})
recentcontent = soup.find('ul',attrs={'class':'linkList3'})
mfeed = []
if maincontent:
st = maincontent.find(attrs={'class':'most-wired-box'})
if st:
for itt in st.findAll('a',href=True):
url = 'http://www.wired.co.uk' + itt['href']
title = self.tag_to_string(itt)
if recentcontent:
for li in recentcontent.findAll('li'):
a = li.h2.a
url = self.index + a['href'] + '?page=all'
title = self.tag_to_string(a)
description = ''
date = strftime(self.timefmt)
mfeed.append({
@ -59,16 +59,91 @@ class Wired_UK(BasicNewsRecipe):
,'url' :url
,'description':description
})
totalfeeds.append(('Articles', mfeed))
totalfeeds.append(('Wired UK Magazine Latest News', mfeed))
popmagcontent = soup.findAll('div',attrs={'class':'sidebarLinkList'})
magcontent = popmagcontent[1]
mfeed2 = []
if magcontent:
a = magcontent.h3.a
if a:
url = self.index + a['href'] + '?page=all'
title = self.tag_to_string(a)
description = ''
date = strftime(self.timefmt)
mfeed2.append({
'title' :title
,'date' :date
,'url' :url
,'description':description
})
for li in magcontent.findAll('li'):
a = li.a
url = self.index + a['href'] + '?page=all'
title = self.tag_to_string(a)
description = ''
date = strftime(self.timefmt)
mfeed2.append({
'title' :title
,'date' :date
,'url' :url
,'description':description
})
totalfeeds.append(('Wired UK Magazine Features', mfeed2))
magsoup = self.index_to_soup(self.index + '/magazine')
startcontent = magsoup.find('h3',attrs={'class':'magSubSectionTitle titleStart'}).parent
mfeed3 = []
if startcontent:
for li in startcontent.findAll('li'):
a = li.a
url = self.index + a['href'] + '?page=all'
title = self.tag_to_string(a)
description = ''
date = strftime(self.timefmt)
mfeed3.append({
'title' :title
,'date' :date
,'url' :url
,'description':description
})
totalfeeds.append(('Wired UK Magazine More', mfeed3))
playcontent = magsoup.find('h3',attrs={'class':'magSubSectionTitle titlePlay'}).parent
mfeed4 = []
if playcontent:
for li in playcontent.findAll('li'):
a = li.a
url = self.index + a['href'] + '?page=all'
title = self.tag_to_string(a)
description = ''
date = strftime(self.timefmt)
mfeed4.append({
'title' :title
,'date' :date
,'url' :url
,'description':description
})
totalfeeds.append(('Wired UK Magazine Play', mfeed4))
return totalfeeds
def get_cover_url(self):
cover_url = None
soup = self.index_to_soup(self.index)
cover_item = soup.find('span', attrs={'class':'cover'})
cover_url = ''
soup = self.index_to_soup(self.index + '/magazine/archive')
cover_item = soup.find('div', attrs={'class':'image linkme'})
if cover_item:
cover_url = cover_item.img['src']
return cover_url
def print_version(self, url):
return url + '?page=all'
def preprocess_html(self, soup):
for tag in soup.findAll(name='p'):
if tag.find(name='span', text=re.compile(r'This article was taken from.*', re.DOTALL|re.IGNORECASE)):
tag.extract()
return soup
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''

View File

@ -15,6 +15,7 @@ class ZeitDe(BasicNewsRecipe):
encoding = 'UTF-8'
__author__ = 'Martin Pitt, Sujata Raman, Ingo Paschke and Marc Toensing'
no_stylesheets = True
max_articles_per_feed = 40

View File

@ -2,18 +2,21 @@
# -*- coding: utf-8 mode: python -*-
__license__ = 'GPL v3'
__copyright__ = '2010-2011, Steffen Siebert <calibre at steffensiebert.de>'
__copyright__ = '2010, Steffen Siebert <calibre at steffensiebert.de>'
__docformat__ = 'restructuredtext de'
__version__ = '1.2'
__version__ = '1.5'
"""
Die Zeit EPUB
"""
import os, urllib2, zipfile, re
import os, zipfile, re, cStringIO
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from calibre import walk
from urlparse import urlparse
from contextlib import closing
from calibre.utils.magick.draw import save_cover_data_to
class ZeitEPUBAbo(BasicNewsRecipe):
@ -22,49 +25,112 @@ class ZeitEPUBAbo(BasicNewsRecipe):
language = 'de'
lang = 'de-DE'
__author__ = 'Steffen Siebert and Tobias Isenberg'
__author__ = 'Steffen Siebert, revised by Tobias Isenberg (with some code by Kovid Goyal)'
needs_subscription = True
conversion_options = {
'no_default_epub_cover' : True,
# fixing the wrong left margin
'mobi_ignore_margins' : True,
'keep_ligatures' : True,
}
preprocess_regexps = [
# filtering for correct dashes
(re.compile(r' - '), lambda match: ' '), # regular "Gedankenstrich"
(re.compile(r' -,'), lambda match: ' ,'), # "Gedankenstrich" before a comma
(re.compile(r'(?<=\d)-(?=\d)'), lambda match: ''), # number-number
# filtering for correct dashes ("Gedankenstrich" and "bis")
(re.compile(u' (-|\u2212)(?=[ ,])'), lambda match: u' \u2013'),
(re.compile(r'(?<=\d)-(?=\d)'), lambda match: u'\u2013'), # number-number
(re.compile(u'(?<=\d,)-(?= ?\u20AC)'), lambda match: u'\u2013'), # ,- Euro
# fix the number dash number dash for the title image that was broken by the previous line
(re.compile(u'(?<=\d\d\d\d)\u2013(?=\d?\d\.png)'), lambda match: '-'),
# filtering for certain dash cases
(re.compile(r'Bild - Zeitung'), lambda match: 'Bild-Zeitung'), # the obvious
(re.compile(r'EMail'), lambda match: 'E-Mail'), # the obvious
(re.compile(r'SBahn'), lambda match: 'S-Bahn'), # the obvious
(re.compile(r'UBoot'), lambda match: 'U-Boot'), # the obvious
(re.compile(r'T Shirt'), lambda match: 'T-Shirt'), # the obvious
(re.compile(r'TShirt'), lambda match: 'T-Shirt'), # the obvious
# the next two lines not only fix errors but also create new ones. this is due to additional errors in
# the typesetting such as missing commas or wrongly placed dashes. but more is fixed than broken.
(re.compile(r'(?<!und|der|\w\w,) -(?=\w)'), lambda match: '-'), # space too much before a connecting dash
(re.compile(r'(?<=\w)- (?!und\b|oder\b|wie\b|aber\b|auch\b|sondern\b|bis\b|&amp;|&\s|bzw\.|auf\b|eher\b)'), lambda match: '-'), # space too much after a connecting dash
# filtering for missing spaces before the month in long dates
(re.compile(u'(?<=\d)\.(?=(Januar|Februar|M\u00E4rz|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember))'), lambda match: '. '),
# filtering for other missing spaces
(re.compile(r'Stuttgart21'), lambda match: 'Stuttgart 21'), # the obvious
(re.compile(u'(?<=\d)(?=\u20AC)'), lambda match: u'\u2013'), # Zahl[no space]Euro
(re.compile(r':(?=[^\d\s</])'), lambda match: ': '), # missing space after colon
(re.compile(u'\u00AB(?=[^\-\.:;,\?!<\)\s])'), lambda match: u'\u00AB '), # missing space after closing quotation
(re.compile(u'(?<=[^\s\(>])\u00BB'), lambda match: u' \u00BB'), # missing space before opening quotation
(re.compile(r'(?<=[a-z])(?=(I|II|III|IV|V|VI|VII|VIII|IX|X|XI|XII|XIII|XIV|XV|XVI|XVII|XVIII|XIX|XX)\.)'), lambda match: ' '), # missing space before Roman numeral
(re.compile(r'(?<=(I|V|X)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
(re.compile(r'(?<=(II|IV|VI|IX|XI|XV|XX)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
(re.compile(r'(?<=(III|VII|XII|XIV|XVI|XIX)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
(re.compile(r'(?<=(VIII|XIII|XVII)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
(re.compile(r'(?<=(XVIII)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
(re.compile(r'(?<=[A-Za-zÄÖÜäöü]),(?=[A-Za-zÄÖÜäöü])'), lambda match: ', '), # missing space after comma
(re.compile(r'(?<=[a-zäöü])\.(?=[A-ZÄÖÜ][A-Za-zÄÖÜäöü])'), lambda match: '. '), # missing space after full-stop
(re.compile(r'(?<=[uU]\.) (?=a\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
(re.compile(r'(?<=[iI]\.) (?=A\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
(re.compile(r'(?<=[zZ]\.) (?=B\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
(re.compile(r'(?<=\w\.) (?=[A-Z][a-z]*@)'), lambda match: ''), # fix e-mail address that was potentially broken previously
(re.compile(r'(?<=\d)[Pp]rozent'), lambda match: ' Prozent'),
(re.compile(r'\.\.\.\.+'), lambda match: '...'), # too many dots (....)
(re.compile(r'(?<=[^\s])\.\.\.'), lambda match: ' ...'), # spaces before ...
(re.compile(r'\.\.\.(?=[^\s])'), lambda match: '... '), # spaces after ...
(re.compile(r'(?<=[\[\(]) \.\.\. (?=[\]\)])'), lambda match: '...'), # fix special cases of ... in brackets
(re.compile(u'(?<=[\u00BB\u203A]) \.\.\.'), lambda match: '...'), # fix special cases of ... after a quotation mark
(re.compile(u'\.\.\. (?=[\u00AB\u2039,])'), lambda match: '...'), # fix special cases of ... before a quotation mark or comma
# fix missing spaces between numbers and any sort of units, possibly with dot
(re.compile(r'(?<=\d)(?=(Femto|Piko|Nano|Mikro|Milli|Zenti|Dezi|Hekto|Kilo|Mega|Giga|Tera|Peta|Tausend|Trilli|Kubik|Quadrat|Meter|Uhr|Jahr|Schuljahr|Seite))'), lambda match: ' '),
(re.compile(r'(?<=\d\.)(?=(Femto|Piko|Nano|Mikro|Milli|Zenti|Dezi|Hekto|Kilo|Mega|Giga|Tera|Peta|Tausend|Trilli|Kubik|Quadrat|Meter|Uhr|Jahr|Schuljahr|Seite))'), lambda match: ' '),
# fix wrong spaces
(re.compile(r'(?<=<p class="absatz">[A-ZÄÖÜ]) (?=[a-zäöü\-])'), lambda match: ''), # at beginning of paragraphs
(re.compile(u' \u00AB'), lambda match: u'\u00AB '), # before closing quotation
(re.compile(u'\u00BB '), lambda match: u' \u00BB'), # after opening quotation
# filtering for spaces in large numbers for better readability
(re.compile(r'(?<=\d\d)(?=\d\d\d[ ,\.;\)<\?!-])'), lambda match: u'\u2008'), # end of the number with some character following
(re.compile(r'(?<=\d\d)(?=\d\d\d. )'), lambda match: u'\u2008'), # end of the number with full-stop following, then space is necessary (avoid file names)
(re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
(re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
(re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
(re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
# filtering for unicode characters that are missing on the Kindle,
# try to replace them with meaningful work-arounds
(re.compile(u'\u2080'), lambda match: '<span style="font-size: 50%;">0</span>'), # subscript-0
(re.compile(u'\u2081'), lambda match: '<span style="font-size: 50%;">1</span>'), # subscript-1
(re.compile(u'\u2082'), lambda match: '<span style="font-size: 50%;">2</span>'), # subscript-2
(re.compile(u'\u2083'), lambda match: '<span style="font-size: 50%;">3</span>'), # subscript-3
(re.compile(u'\u2084'), lambda match: '<span style="font-size: 50%;">4</span>'), # subscript-4
(re.compile(u'\u2085'), lambda match: '<span style="font-size: 50%;">5</span>'), # subscript-5
(re.compile(u'\u2086'), lambda match: '<span style="font-size: 50%;">6</span>'), # subscript-6
(re.compile(u'\u2087'), lambda match: '<span style="font-size: 50%;">7</span>'), # subscript-7
(re.compile(u'\u2088'), lambda match: '<span style="font-size: 50%;">8</span>'), # subscript-8
(re.compile(u'\u2089'), lambda match: '<span style="font-size: 50%;">9</span>'), # subscript-9
(re.compile(u'\u2080'), lambda match: '<span style="font-size: 40%;">0</span>'), # subscript-0
(re.compile(u'\u2081'), lambda match: '<span style="font-size: 40%;">1</span>'), # subscript-1
(re.compile(u'\u2082'), lambda match: '<span style="font-size: 40%;">2</span>'), # subscript-2
(re.compile(u'\u2083'), lambda match: '<span style="font-size: 40%;">3</span>'), # subscript-3
(re.compile(u'\u2084'), lambda match: '<span style="font-size: 40%;">4</span>'), # subscript-4
(re.compile(u'\u2085'), lambda match: '<span style="font-size: 40%;">5</span>'), # subscript-5
(re.compile(u'\u2086'), lambda match: '<span style="font-size: 40%;">6</span>'), # subscript-6
(re.compile(u'\u2087'), lambda match: '<span style="font-size: 40%;">7</span>'), # subscript-7
(re.compile(u'\u2088'), lambda match: '<span style="font-size: 40%;">8</span>'), # subscript-8
(re.compile(u'\u2089'), lambda match: '<span style="font-size: 40%;">9</span>'), # subscript-9
# always chance CO2
(re.compile(r'CO2'), lambda match: 'CO<span style="font-size: 40%;">2</span>'), # CO2
# remove *** paragraphs
(re.compile(r'<p class="absatz">\*\*\*</p>'), lambda match: ''),
# better layout for the top line of each article
(re.compile(u'(?<=DIE ZEIT N\u00B0 \d /) (?=\d\d)'), lambda match: ' 20'), # proper year in edition number
(re.compile(u'(?<=DIE ZEIT N\u00B0 \d\d /) (?=\d\d)'), lambda match: ' 20'), # proper year in edition number
(re.compile(u'(?<=>)(?=DIE ZEIT N\u00B0 \d\d / 20\d\d)'), lambda match: u' \u2014 '), # m-dash between category and DIE ZEIT
]
def build_index(self):
domain = "http://premium.zeit.de"
url = domain + "/abovorteile/cgi-bin/_er_member/p4z.fpl?ER_Do=getUserData&ER_NextTemplate=login_ok"
domain = "https://premium.zeit.de"
url = domain + "/abo/zeit_digital"
browser = self.get_browser()
browser.add_password("http://premium.zeit.de", self.username, self.password)
try:
browser.open(url)
except urllib2.HTTPError:
self.report_progress(0,_("Can't login to download issue"))
raise ValueError('Failed to login, check your username and password')
response = browser.follow_link(text="DIE ZEIT als E-Paper")
response = browser.follow_link(url_regex=re.compile('^http://contentserver.hgv-online.de/nodrm/fulfillment\\?distributor=zeit-online&orderid=zeit_online.*'))
# new login process
response = browser.open(url)
browser.select_form(nr=2)
browser.form['name']=self.username
browser.form['pass']=self.password
browser.submit()
# now find the correct file, we will still use the ePub file
epublink = browser.find_link(text_regex=re.compile('.*Ausgabe als Datei im ePub-Format.*'))
response = browser.follow_link(epublink)
self.report_progress(1,_('next step'))
tmp = PersistentTemporaryFile(suffix='.epub')
self.report_progress(0,_('downloading epub'))
@ -104,9 +170,45 @@ class ZeitEPUBAbo(BasicNewsRecipe):
# getting url of the cover
def get_cover_url(self):
self.log.warning('Downloading cover')
try:
self.log.warning('Trying PDF-based cover')
domain = "https://premium.zeit.de"
url = domain + "/abo/zeit_digital"
browser = self.get_browser()
# new login process
browser.open(url)
browser.select_form(nr=2)
browser.form['name']=self.username
browser.form['pass']=self.password
browser.submit()
# actual cover search
pdflink = browser.find_link(url_regex=re.compile('system/files/epaper/DZ/pdf/DZ_ePaper*'))
cover_url = urlparse(pdflink.base_url)[0]+'://'+urlparse(pdflink.base_url)[1]+''+(urlparse(pdflink.url)[2]).replace('ePaper_','').replace('.pdf','_001.pdf')
self.log.warning('PDF link found:')
self.log.warning(cover_url)
# download the cover (has to be here due to new login process)
with closing(browser.open(cover_url)) as r:
cdata = r.read()
from calibre.ebooks.metadata.pdf import get_metadata
stream = cStringIO.StringIO(cdata)
cdata = None
mi = get_metadata(stream)
if mi.cover_data and mi.cover_data[1]:
cdata = mi.cover_data[1]
cpath = os.path.join(self.output_dir, 'cover.jpg')
save_cover_data_to(cdata, cpath)
cover_url = cpath
except:
self.log.warning('Trying low-res cover')
try:
inhalt = self.index_to_soup('http://www.zeit.de/inhalt')
cover_url = inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','')
except:
self.log.warning('Using static old low-res cover')
cover_url = 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'
return cover_url

View File

@ -11,6 +11,7 @@
<link rel="stylesheet" type="text/css" href="{prefix}/static/browse/browse.css" />
<link type="text/css" href="{prefix}/static/jquery_ui/css/humanity-custom/jquery-ui-1.8.5.custom.css" rel="stylesheet" />
<link rel="stylesheet" type="text/css" href="{prefix}/static/jquery.multiselect.css" />
<link rel="apple-touch-icon" href="/static/calibre.png" />
<script type="text/javascript" src="{prefix}/static/jquery.js"></script>
<script type="text/javascript" src="{prefix}/static/jquery.corner.js"></script>

View File

@ -11,7 +11,7 @@ defaults.
'''
#: Auto increment series index
# The algorithm used to assign a new book in an existing series a series number.
# The algorithm used to assign a book added to an existing series a series number.
# New series numbers assigned using this tweak are always integer values, except
# if a constant non-integer is specified.
# Possible values are:
@ -27,7 +27,19 @@ defaults.
# series_index_auto_increment = 'next'
# series_index_auto_increment = 'next_free'
# series_index_auto_increment = 16.5
#
# Set the use_series_auto_increment_tweak_when_importing tweak to True to
# use the above values when importing/adding books. If this tweak is set to
# False (the default) then the series number will be set to 1 if it is not
# explicitly set to during the import. If set to True, then the
# series index will be set according to the series_index_auto_increment setting.
# Note that the use_series_auto_increment_tweak_when_importing tweak is used
# only when a value is not provided during import. If the importing regular
# expression produces a value for series_index, or if you are reading metadata
# from books and the import plugin produces a value, than that value will
# be used irrespective of the setting of the tweak.
series_index_auto_increment = 'next'
use_series_auto_increment_tweak_when_importing = False
#: Add separator after completing an author name
# Should the completion separator be append
@ -366,3 +378,10 @@ server_listen_on = '0.0.0.0'
# on at your own risk!
unified_title_toolbar_on_osx = False
#: Save original file when converting from same format to same format
# When calibre does a conversion from the same format to the same format, for
# example, from EPUB to EPUB, the original file is saved, so that in case the
# conversion is poor, you can tweak the settings and run it again. By setting
# this to False you can prevent calibre from saving the original file.
save_original_format = True

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.7 KiB

After

Width:  |  Height:  |  Size: 4.3 KiB

BIN
resources/images/random.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.3 KiB

View File

@ -379,7 +379,8 @@
<!-- image -->
<xsl:template match="fb:image">
<div align="center">
<img border="1">
<xsl:element name="img">
<xsl:attribute name="border">1</xsl:attribute>
<xsl:choose>
<xsl:when test="starts-with(@xlink:href,'#')">
<xsl:attribute name="src"><xsl:value-of select="substring-after(@xlink:href,'#')"/></xsl:attribute>
@ -388,7 +389,10 @@
<xsl:attribute name="src"><xsl:value-of select="@xlink:href"/></xsl:attribute>
</xsl:otherwise>
</xsl:choose>
</img>
<xsl:if test="@title">
<xsl:attribute name="title"><xsl:value-of select="@title"/></xsl:attribute>
</xsl:if>
</xsl:element>
</div>
</xsl:template>
</xsl:stylesheet>

View File

@ -1,5 +1,5 @@
" Project wide builtins
let g:pyflakes_builtins += ["dynamic_property", "__", "P", "I", "lopen", "icu_lower", "icu_upper", "icu_title", "ngettext"]
let g:pyflakes_builtins = ["_", "dynamic_property", "__", "P", "I", "lopen", "icu_lower", "icu_upper", "icu_title", "ngettext"]
python << EOFPY
import os
@ -15,7 +15,7 @@ vipy.session.initialize(project_name='calibre', src_dir=src_dir,
project_dir=project_dir, base_dir=base_dir)
def recipe_title_callback(raw):
return eval(raw.decode('utf-8'))
return eval(raw.decode('utf-8')).replace(' ', '_')
vipy.session.add_content_browser('.r', ',r', 'Recipe',
vipy.session.glob_based_iterator(os.path.join(project_dir, 'recipes', '*.recipe')),

View File

@ -25,18 +25,11 @@ class Message:
return '%s:%s: %s'%(self.filename, self.lineno, self.msg)
def check_for_python_errors(code_string, filename):
# Since compiler.parse does not reliably report syntax errors, use the
# built in compiler first to detect those.
import _ast
# First, compile into an AST and handle syntax errors.
try:
try:
compile(code_string, filename, "exec")
except MemoryError:
# Python 2.4 will raise MemoryError if the source can't be
# decoded.
if sys.version_info[:2] == (2, 4):
raise SyntaxError(None)
raise
except (SyntaxError, IndentationError), value:
tree = compile(code_string, filename, "exec", _ast.PyCF_ONLY_AST)
except (SyntaxError, IndentationError) as value:
msg = value.args[0]
(lineno, offset, text) = value.lineno, value.offset, value.text
@ -47,13 +40,11 @@ def check_for_python_errors(code_string, filename):
# bogus message that claims the encoding the file declared was
# unknown.
msg = "%s: problem decoding source" % filename
return [Message(filename, lineno, msg)]
else:
# Okay, it's syntactically valid. Now parse it into an ast and check
# it.
import compiler
checker = __import__('pyflakes.checker').checker
tree = compiler.parse(code_string)
# Okay, it's syntactically valid. Now check it.
w = checker.Checker(tree, filename)
w.messages.sort(lambda a, b: cmp(a.lineno, b.lineno))
return [Message(x.filename, x.lineno, x.message%x.message_args) for x in

View File

@ -373,7 +373,7 @@ class Win32Freeze(Command, WixMixIn):
src = self.j(self.src_root, 'setup', 'installer', 'windows',
'portable.c')
obj = self.j(self.obj_dir, self.b(src)+'.obj')
cflags = '/c /EHsc /MT /W3 /Ox /nologo /D_UNICODE'.split()
cflags = '/c /EHsc /MT /W3 /Ox /nologo /D_UNICODE /DUNICODE'.split()
if self.newer(obj, [src]):
self.info('Compiling', obj)
@ -386,6 +386,7 @@ class Win32Freeze(Command, WixMixIn):
cmd = [msvc.linker] + ['/INCREMENTAL:NO', '/MACHINE:X86',
'/LIBPATH:'+self.obj_dir, '/SUBSYSTEM:WINDOWS',
'/RELEASE',
'/ENTRY:wWinMainCRTStartup',
'/OUT:'+exe, self.embed_resources(exe),
obj, 'User32.lib']
self.run_builder(cmd)

View File

@ -2,15 +2,21 @@
#define UNICODE
#endif
#ifndef _UNICODE
#define _UNICODE
#endif
#include <windows.h>
#include <tchar.h>
#include <wchar.h>
#include <stdio.h>
#define BUFSIZE 4096
void show_error(LPCTSTR msg) {
MessageBeep(MB_ICONERROR);
MessageBox(NULL, msg, TEXT("Error"), MB_OK|MB_ICONERROR);
MessageBox(NULL, msg, _T("Error"), MB_OK|MB_ICONERROR);
}
void show_detailed_error(LPCTSTR preamble, LPCTSTR msg, int code) {
@ -20,7 +26,7 @@ void show_detailed_error(LPCTSTR preamble, LPCTSTR msg, int code) {
_sntprintf_s(buf,
LocalSize(buf) / sizeof(TCHAR), _TRUNCATE,
TEXT("%s\r\n %s (Error Code: %d)\r\n"),
_T("%s\r\n %s (Error Code: %d)\r\n"),
preamble, msg, code);
show_error(buf);
@ -32,7 +38,7 @@ void show_last_error_crt(LPCTSTR preamble) {
int err = 0;
_get_errno(&err);
_wcserror_s(buf, BUFSIZE, err);
_tcserror_s(buf, BUFSIZE, err);
show_detailed_error(preamble, buf, err);
}
@ -57,7 +63,7 @@ void show_last_error(LPCTSTR preamble) {
LPTSTR get_app_dir() {
LPTSTR buf, buf2, buf3;
DWORD sz;
TCHAR drive[4] = TEXT("\0\0\0");
TCHAR drive[4] = _T("\0\0\0");
errno_t err;
buf = (LPTSTR)calloc(BUFSIZE, sizeof(TCHAR));
@ -67,18 +73,18 @@ LPTSTR get_app_dir() {
sz = GetModuleFileName(NULL, buf, BUFSIZE);
if (sz == 0 || sz > BUFSIZE-1) {
show_error(TEXT("Failed to get path to calibre-portable.exe"));
show_error(_T("Failed to get path to calibre-portable.exe"));
ExitProcess(1);
}
err = _tsplitpath_s(buf, drive, 4, buf2, BUFSIZE, NULL, 0, NULL, 0);
if (err != 0) {
show_last_error_crt(TEXT("Failed to split path to calibre-portable.exe"));
show_last_error_crt(_T("Failed to split path to calibre-portable.exe"));
ExitProcess(1);
}
_sntprintf_s(buf3, BUFSIZE-1, _TRUNCATE, TEXT("%s%s"), drive, buf2);
_sntprintf_s(buf3, BUFSIZE-1, _TRUNCATE, _T("%s%s"), drive, buf2);
free(buf); free(buf2);
return buf3;
}
@ -90,18 +96,18 @@ void launch_calibre(LPCTSTR exe, LPCTSTR config_dir, LPCTSTR library_dir) {
BOOL fSuccess;
TCHAR cmdline[BUFSIZE];
if (! SetEnvironmentVariable(TEXT("CALIBRE_CONFIG_DIRECTORY"), config_dir)) {
show_last_error(TEXT("Failed to set environment variables"));
if (! SetEnvironmentVariable(_T("CALIBRE_CONFIG_DIRECTORY"), config_dir)) {
show_last_error(_T("Failed to set environment variables"));
ExitProcess(1);
}
if (! SetEnvironmentVariable(TEXT("CALIBRE_PORTABLE_BUILD"), exe)) {
show_last_error(TEXT("Failed to set environment variables"));
if (! SetEnvironmentVariable(_T("CALIBRE_PORTABLE_BUILD"), exe)) {
show_last_error(_T("Failed to set environment variables"));
ExitProcess(1);
}
dwFlags = CREATE_UNICODE_ENVIRONMENT | CREATE_NEW_PROCESS_GROUP;
_sntprintf_s(cmdline, BUFSIZE, _TRUNCATE, TEXT(" \"--with-library=%s\""), library_dir);
_sntprintf_s(cmdline, BUFSIZE, _TRUNCATE, _T(" \"--with-library=%s\""), library_dir);
ZeroMemory( &si, sizeof(si) );
si.cb = sizeof(si);
@ -119,7 +125,7 @@ void launch_calibre(LPCTSTR exe, LPCTSTR config_dir, LPCTSTR library_dir) {
);
if (fSuccess == 0) {
show_last_error(TEXT("Failed to launch the calibre program"));
show_last_error(_T("Failed to launch the calibre program"));
}
// Close process and thread handles.
@ -137,9 +143,9 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, PWSTR pCmdLine
library_dir = (LPTSTR)calloc(BUFSIZE, sizeof(TCHAR));
exe = (LPTSTR)calloc(BUFSIZE, sizeof(TCHAR));
_sntprintf_s(config_dir, BUFSIZE, _TRUNCATE, TEXT("%sCalibre Settings"), app_dir);
_sntprintf_s(exe, BUFSIZE, _TRUNCATE, TEXT("%sCalibre\\calibre.exe"), app_dir);
_sntprintf_s(library_dir, BUFSIZE, _TRUNCATE, TEXT("%sCalibre Library"), app_dir);
_sntprintf_s(config_dir, BUFSIZE, _TRUNCATE, _T("%sCalibre Settings"), app_dir);
_sntprintf_s(exe, BUFSIZE, _TRUNCATE, _T("%sCalibre\\calibre.exe"), app_dir);
_sntprintf_s(library_dir, BUFSIZE, _TRUNCATE, _T("%sCalibre Library"), app_dir);
launch_calibre(exe, config_dir, library_dir);

View File

@ -205,8 +205,8 @@ class Resources(Command):
dest = self.j(self.RESOURCES, 'template-functions.json')
function_dict = {}
import inspect
from calibre.utils.formatter_functions import all_builtin_functions
for obj in all_builtin_functions:
from calibre.utils.formatter_functions import formatter_functions
for obj in formatter_functions.get_builtins().values():
eval_func = inspect.getmembers(obj,
lambda x: inspect.ismethod(x) and x.__name__ == 'evaluate')
try:

View File

@ -8,11 +8,18 @@ __docformat__ = 'restructuredtext en'
import os, tempfile, shutil, subprocess, glob, re, time, textwrap
from distutils import sysconfig
from functools import partial
from setup import Command, __appname__, __version__
from setup.build_environment import pyqt
class POT(Command):
def qt_sources():
qtdir = glob.glob('/usr/src/qt-*')[-1]
j = partial(os.path.join, qtdir)
return list(map(j, [
'src/gui/widgets/qdialogbuttonbox.cpp',
]))
class POT(Command): # {{{
description = 'Update the .pot translation template'
PATH = os.path.join(Command.SRC, __appname__, 'translations')
@ -82,6 +89,8 @@ class POT(Command):
time=time.strftime('%Y-%m-%d %H:%M+%Z'))
files = self.source_files()
qt_inputs = qt_sources()
with tempfile.NamedTemporaryFile() as fl:
fl.write('\n'.join(files))
fl.flush()
@ -91,8 +100,14 @@ class POT(Command):
subprocess.check_call(['xgettext', '-f', fl.name,
'--default-domain=calibre', '-o', out.name, '-L', 'Python',
'--from-code=UTF-8', '--sort-by-file', '--omit-header',
'--no-wrap', '-k__',
'--no-wrap', '-k__', '--add-comments=NOTE:',
])
subprocess.check_call(['xgettext', '-j',
'--default-domain=calibre', '-o', out.name,
'--from-code=UTF-8', '--sort-by-file', '--omit-header',
'--no-wrap', '-kQT_TRANSLATE_NOOP:2',
] + qt_inputs)
with open(out.name, 'rb') as f:
src = f.read()
os.remove(out.name)
@ -102,10 +117,12 @@ class POT(Command):
with open(pot, 'wb') as f:
f.write(src)
self.info('Translations template:', os.path.abspath(pot))
return pot
# }}}
class Translations(POT):
class Translations(POT): # {{{
description='''Compile the translations'''
DEST = os.path.join(os.path.dirname(POT.SRC), 'resources', 'localization',
'locales')
@ -117,7 +134,6 @@ class Translations(POT):
locale = os.path.splitext(os.path.basename(po_file))[0]
return locale, os.path.join(self.DEST, locale, 'messages.mo')
def run(self, opts):
for f in self.po_files():
locale, dest = self.mo_file(f)
@ -126,7 +142,7 @@ class Translations(POT):
os.makedirs(base)
self.info('\tCompiling translations for', locale)
subprocess.check_call(['msgfmt', '-o', dest, f])
if locale in ('en_GB', 'nds', 'te', 'yi'):
if locale in ('en_GB', 'en_CA', 'en_AU', 'si', 'ur', 'sc', 'ltg', 'nds', 'te', 'yi'):
continue
pycountry = self.j(sysconfig.get_python_lib(), 'pycountry',
'locales', locale, 'LC_MESSAGES')
@ -140,17 +156,6 @@ class Translations(POT):
self.warn('No ISO 639 translations for locale:', locale,
'\nDo you have pycountry installed?')
base = os.path.join(pyqt.qt_data_dir, 'translations')
qt_translations = glob.glob(os.path.join(base, 'qt_*.qm'))
if not qt_translations:
raise Exception('Could not find qt translations')
for f in qt_translations:
locale = self.s(self.b(f))[0][3:]
dest = self.j(self.DEST, locale, 'LC_MESSAGES', 'qt.qm')
if self.e(self.d(dest)) and self.newer(dest, f):
self.info('\tCopying Qt translation for locale:', locale)
shutil.copy2(f, dest)
self.write_stats()
self.freeze_locales()
@ -201,7 +206,7 @@ class Translations(POT):
for x in (i, j, d):
if os.path.exists(x):
os.remove(x)
# }}}
class GetTranslations(Translations):

View File

@ -341,7 +341,7 @@ def random_user_agent():
def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
'''
Create a mechanize browser for web scraping. The browser handles cookies,
refresh requests and ignores robots.txt. Also uses proxy if avaialable.
refresh requests and ignores robots.txt. Also uses proxy if available.
:param honor_time: If True honors pause time in refresh requests
:param max_time: Maximum time in seconds to wait during a refresh request
@ -353,9 +353,14 @@ def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
if user_agent is None:
user_agent = USER_AGENT_MOBILE if mobile_browser else USER_AGENT
opener.addheaders = [('User-agent', user_agent)]
http_proxy = get_proxies().get('http', None)
proxies = get_proxies()
http_proxy = proxies.get('http', None)
if http_proxy:
opener.set_proxies({'http':http_proxy})
https_proxy = proxies.get('https', None)
if https_proxy:
opener.set_proxies({'https':https_proxy})
return opener
def fit_image(width, height, pwidth, pheight):
@ -474,7 +479,7 @@ def strftime(fmt, t=None):
def my_unichr(num):
try:
return unichr(num)
except ValueError:
except (ValueError, OverflowError):
return u'?'
def entity_to_unicode(match, exceptions=[], encoding='cp1252',

View File

@ -4,7 +4,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
__appname__ = u'calibre'
numeric_version = (0, 8, 9)
numeric_version = (0, 8, 12)
__version__ = u'.'.join(map(unicode, numeric_version))
__author__ = u"Kovid Goyal <kovid@kovidgoyal.net>"

View File

@ -570,7 +570,7 @@ from calibre.devices.teclast.driver import (TECLAST_K3, NEWSMY, IPAPYRUS,
from calibre.devices.sne.driver import SNE
from calibre.devices.misc import (PALMPRE, AVANT, SWEEX, PDNOVEL,
GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, LUMIREAD, ALURATEK_COLOR,
TREKSTOR, EEEREADER, NEXTBOOK, ADAM, MOOVYBOOK)
TREKSTOR, EEEREADER, NEXTBOOK, ADAM, MOOVYBOOK, COBY)
from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
from calibre.devices.kobo.driver import KOBO
from calibre.devices.bambook.driver import BAMBOOK
@ -705,7 +705,7 @@ plugins += [
EEEREADER,
NEXTBOOK,
ADAM,
MOOVYBOOK,
MOOVYBOOK, COBY,
ITUNES,
BOEYE_BEX,
BOEYE_BDX,
@ -843,6 +843,12 @@ class ActionNextMatch(InterfaceActionBase):
description = _('Find the next or previous match when searching in '
'your calibre library in highlight mode')
class ActionPickRandom(InterfaceActionBase):
name = 'Pick Random Book'
actual_plugin = 'calibre.gui2.actions.random:PickRandomAction'
description = _('Choose a random book from your calibre library')
class ActionStore(InterfaceActionBase):
name = 'Store'
author = 'John Schember'
@ -873,7 +879,7 @@ plugins += [ActionAdd, ActionFetchAnnotations, ActionGenerateCatalog,
ActionSendToDevice, ActionHelp, ActionPreferences, ActionSimilarBooks,
ActionAddToLibrary, ActionEditCollections, ActionChooseLibrary,
ActionCopyToLibrary, ActionTweakEpub, ActionNextMatch, ActionStore,
ActionPluginUpdater]
ActionPluginUpdater, ActionPickRandom]
# }}}
@ -1181,6 +1187,26 @@ class StoreBeWriteStore(StoreBase):
headquarters = 'US'
formats = ['EPUB', 'MOBI', 'PDF']
class StoreBookotekaStore(StoreBase):
name = 'Bookoteka'
author = u'Tomasz Długosz'
description = u'E-booki w Bookotece dostępne są w formacie EPUB oraz PDF. Publikacje sprzedawane w Bookotece są objęte prawami autorskimi. Zobowiązaliśmy się chronić te prawa, ale bez ograniczania dostępu do książki użytkownikowi, który nabył ją w legalny sposób. Dlatego też Bookoteka stosuje tak zwany „watermarking transakcyjny” czyli swego rodzaju znaki wodne.'
actual_plugin = 'calibre.gui2.store.stores.bookoteka_plugin:BookotekaStore'
drm_free_only = True
headquarters = 'PL'
formats = ['EPUB', 'PDF']
class StoreChitankaStore(StoreBase):
name = u'Моята библиотека'
author = 'Alex Stanev'
description = u'Независим сайт за DRM свободна литература на български език'
actual_plugin = 'calibre.gui2.store.stores.chitanka_plugin:ChitankaStore'
drm_free_only = True
headquarters = 'BG'
formats = ['FB2', 'EPUB', 'TXT', 'SFB']
class StoreDieselEbooksStore(StoreBase):
name = 'Diesel eBooks'
description = u'Instant access to over 2.4 million titles from hundreds of publishers including Harlequin, HarperCollins, John Wiley & Sons, McGraw-Hill, Simon & Schuster and Random House.'
@ -1190,6 +1216,15 @@ class StoreDieselEbooksStore(StoreBase):
formats = ['EPUB', 'PDF']
affiliate = True
class StoreEbookNLStore(StoreBase):
name = 'eBook.nl'
description = u'De eBookwinkel van Nederland'
actual_plugin = 'calibre.gui2.store.stores.ebook_nl_plugin:EBookNLStore'
headquarters = 'NL'
formats = ['EPUB', 'PDF']
affiliate = True
class StoreEbookscomStore(StoreBase):
name = 'eBooks.com'
description = u'Sells books in multiple electronic formats in all categories. Technical infrastructure is cutting edge, robust and scalable, with servers in the US and Europe.'
@ -1199,17 +1234,6 @@ class StoreEbookscomStore(StoreBase):
formats = ['EPUB', 'LIT', 'MOBI', 'PDF']
affiliate = True
class StoreEPubBuyDEStore(StoreBase):
name = 'EPUBBuy DE'
author = 'Charles Haley'
description = u'Bei EPUBBuy.com finden Sie ausschliesslich eBooks im weitverbreiteten EPUB-Format und ohne DRM. So haben Sie die freie Wahl, wo Sie Ihr eBook lesen: Tablet, eBook-Reader, Smartphone oder einfach auf Ihrem PC. So macht eBook-Lesen Spaß!'
actual_plugin = 'calibre.gui2.store.stores.epubbuy_de_plugin:EPubBuyDEStore'
drm_free_only = True
headquarters = 'DE'
formats = ['EPUB']
affiliate = True
class StoreEBookShoppeUKStore(StoreBase):
name = 'ebookShoppe UK'
author = u'Charles Haley'
@ -1229,14 +1253,15 @@ class StoreEHarlequinStore(StoreBase):
formats = ['EPUB', 'PDF']
affiliate = True
class StoreEpubBudStore(StoreBase):
name = 'ePub Bud'
description = 'Well, it\'s pretty much just "YouTube for Children\'s eBooks. A not-for-profit organization devoted to brining self published childrens books to the world.'
actual_plugin = 'calibre.gui2.store.stores.epubbud_plugin:EpubBudStore'
class StoreEKnigiStore(StoreBase):
name = u'еКниги'
author = 'Alex Stanev'
description = u'Онлайн книжарница за електронни книги и аудио риалити романи'
actual_plugin = 'calibre.gui2.store.stores.eknigi_plugin:eKnigiStore'
drm_free_only = True
headquarters = 'US'
formats = ['EPUB']
headquarters = 'BG'
formats = ['EPUB', 'PDF', 'HTML']
affiliate = True
class StoreFeedbooksStore(StoreBase):
name = 'Feedbooks'
@ -1272,6 +1297,7 @@ class StoreGoogleBooksStore(StoreBase):
headquarters = 'US'
formats = ['EPUB', 'PDF', 'TXT']
affiliate = True
class StoreGutenbergStore(StoreBase):
name = 'Project Gutenberg'
@ -1355,6 +1381,17 @@ class StoreOReillyStore(StoreBase):
headquarters = 'US'
formats = ['APK', 'DAISY', 'EPUB', 'MOBI', 'PDF']
class StoreOzonRUStore(StoreBase):
name = 'OZON.ru'
description = u'ebooks from OZON.ru'
actual_plugin = 'calibre.gui2.store.stores.ozon_ru_plugin:OzonRUStore'
author = 'Roman Mukhin'
drm_free_only = True
headquarters = 'RU'
formats = ['TXT', 'PDF', 'DJVU', 'RTF', 'DOC', 'JAR', 'FB2']
affiliate = True
class StorePragmaticBookshelfStore(StoreBase):
name = 'Pragmatic Bookshelf'
description = u'The Pragmatic Bookshelf\'s collection of programming and tech books avaliable as ebooks.'
@ -1446,12 +1483,14 @@ plugins += [
StoreBNStore,
StoreBeamEBooksDEStore,
StoreBeWriteStore,
StoreBookotekaStore,
StoreChitankaStore,
StoreDieselEbooksStore,
StoreEbookNLStore,
StoreEbookscomStore,
StoreEBookShoppeUKStore,
StoreEPubBuyDEStore,
StoreEHarlequinStore,
StoreEpubBudStore,
StoreEKnigiStore,
StoreFeedbooksStore,
StoreFoylesUKStore,
StoreGandalfStore,
@ -1465,6 +1504,7 @@ plugins += [
StoreNextoStore,
StoreOpenBooksStore,
StoreOReillyStore,
StoreOzonRUStore,
StorePragmaticBookshelfStore,
StoreSmashwordsStore,
StoreVirtualoStore,

View File

@ -8,7 +8,7 @@ __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
# Imports {{{
import os, shutil, uuid, json
import os, shutil, uuid, json, glob, time, tempfile
from functools import partial
import apsw
@ -25,7 +25,7 @@ from calibre.utils.config import to_json, from_json, prefs, tweaks
from calibre.utils.date import utcfromtimestamp, parse_date
from calibre.utils.filenames import is_case_sensitive
from calibre.db.tables import (OneToOneTable, ManyToOneTable, ManyToManyTable,
SizeTable, FormatsTable, AuthorsTable, IdentifiersTable)
SizeTable, FormatsTable, AuthorsTable, IdentifiersTable, CompositeTable)
# }}}
'''
@ -37,6 +37,8 @@ Differences in semantics from pysqlite:
'''
SPOOL_SIZE = 30*1024*1024
class DynamicFilter(object): # {{{
'No longer used, present for legacy compatibility'
@ -478,7 +480,6 @@ class DB(object):
remove.append(data)
continue
self.custom_column_label_map[data['label']] = data['num']
self.custom_column_num_map[data['num']] = \
self.custom_column_label_map[data['label']] = data
@ -613,10 +614,31 @@ class DB(object):
tables['size'] = SizeTable('size', self.field_metadata['size'].copy())
for label, data in self.custom_column_label_map.iteritems():
label = '#' + label
self.FIELD_MAP = {'id':0, 'title':1, 'authors':2, 'timestamp':3,
'size':4, 'rating':5, 'tags':6, 'comments':7, 'series':8,
'publisher':9, 'series_index':10, 'sort':11, 'author_sort':12,
'formats':13, 'path':14, 'pubdate':15, 'uuid':16, 'cover':17,
'au_map':18, 'last_modified':19, 'identifiers':20}
for k,v in self.FIELD_MAP.iteritems():
self.field_metadata.set_field_record_index(k, v, prefer_custom=False)
base = max(self.FIELD_MAP.itervalues())
for label_, data in self.custom_column_label_map.iteritems():
label = self.field_metadata.custom_field_prefix + label_
metadata = self.field_metadata[label].copy()
link_table = self.custom_table_names(data['num'])[1]
self.FIELD_MAP[data['num']] = base = base+1
self.field_metadata.set_field_record_index(label_, base,
prefer_custom=True)
if data['datatype'] == 'series':
# account for the series index column. Field_metadata knows that
# the series index is one larger than the series. If you change
# it here, be sure to change it there as well.
self.FIELD_MAP[str(data['num'])+'_index'] = base = base+1
self.field_metadata.set_field_record_index(label_+'_index', base,
prefer_custom=True)
if data['normalized']:
if metadata['is_multiple']:
@ -632,8 +654,17 @@ class DB(object):
metadata['column'] = 'extra'
metadata['table'] = link_table
tables[label] = OneToOneTable(label, metadata)
else:
if data['datatype'] == 'composite':
tables[label] = CompositeTable(label, metadata)
else:
tables[label] = OneToOneTable(label, metadata)
self.FIELD_MAP['ondevice'] = base = base+1
self.field_metadata.set_field_record_index('ondevice', base, prefer_custom=False)
self.FIELD_MAP['marked'] = base = base+1
self.field_metadata.set_field_record_index('marked', base, prefer_custom=False)
# }}}
@property
@ -732,5 +763,57 @@ class DB(object):
pprint.pprint(table.metadata)
raise
def format_abspath(self, book_id, fmt, fname, path):
path = os.path.join(self.library_path, path)
fmt = ('.' + fmt.lower()) if fmt else ''
fmt_path = os.path.join(path, fname+fmt)
if os.path.exists(fmt_path):
return fmt_path
try:
candidates = glob.glob(os.path.join(path, '*'+fmt))
except: # If path contains strange characters this throws an exc
candidates = []
if fmt and candidates and os.path.exists(candidates[0]):
shutil.copyfile(candidates[0], fmt_path)
return fmt_path
def format_metadata(self, book_id, fmt, fname, path):
path = self.format_abspath(book_id, fmt, fname, path)
ans = {}
if path is not None:
stat = os.stat(path)
ans['size'] = stat.st_size
ans['mtime'] = utcfromtimestamp(stat.st_mtime)
return ans
def cover(self, path, as_file=False, as_image=False,
as_path=False):
path = os.path.join(self.library_path, path, 'cover.jpg')
ret = None
if os.access(path, os.R_OK):
try:
f = lopen(path, 'rb')
except (IOError, OSError):
time.sleep(0.2)
f = lopen(path, 'rb')
with f:
if as_path:
pt = PersistentTemporaryFile('_dbcover.jpg')
with pt:
shutil.copyfileobj(f, pt)
return pt.name
if as_file:
ret = tempfile.SpooledTemporaryFile(SPOOL_SIZE)
shutil.copyfileobj(f, ret)
ret.seek(0)
else:
ret = f.read()
if as_image:
from PyQt4.Qt import QImage
i = QImage()
i.loadFromData(ret)
ret = i
return ret
# }}}

View File

@ -7,5 +7,380 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from collections import defaultdict
from functools import wraps, partial
from calibre.db.locking import create_locks, RecordLock
from calibre.db.fields import create_field
from calibre.ebooks.book.base import Metadata
from calibre.utils.date import now
def api(f):
f.is_cache_api = True
return f
def read_api(f):
f = api(f)
f.is_read_api = True
return f
def write_api(f):
f = api(f)
f.is_read_api = False
return f
def wrap_simple(lock, func):
@wraps(func)
def ans(*args, **kwargs):
with lock:
return func(*args, **kwargs)
return ans
class Cache(object):
def __init__(self, backend):
self.backend = backend
self.fields = {}
self.composites = set()
self.read_lock, self.write_lock = create_locks()
self.record_lock = RecordLock(self.read_lock)
self.format_metadata_cache = defaultdict(dict)
# Implement locking for all simple read/write API methods
# An unlocked version of the method is stored with the name starting
# with a leading underscore. Use the unlocked versions when the lock
# has already been acquired.
for name in dir(self):
func = getattr(self, name)
ira = getattr(func, 'is_read_api', None)
if ira is not None:
# Save original function
setattr(self, '_'+name, func)
# Wrap it in a lock
lock = self.read_lock if ira else self.write_lock
setattr(self, name, wrap_simple(lock, func))
@property
def field_metadata(self):
return self.backend.field_metadata
def _format_abspath(self, book_id, fmt):
'''
Return absolute path to the ebook file of format `format`
WARNING: This method will return a dummy path for a network backend DB,
so do not rely on it, use format(..., as_path=True) instead.
Currently used only in calibredb list, the viewer and the catalogs (via
get_data_as_dict()).
Apart from the viewer, I don't believe any of the others do any file
I/O with the results of this call.
'''
try:
name = self.fields['formats'].format_fname(book_id, fmt)
path = self._field_for('path', book_id).replace('/', os.sep)
except:
return None
if name and path:
return self.backend.format_abspath(book_id, fmt, name, path)
def _get_metadata(self, book_id, get_user_categories=True): # {{{
mi = Metadata(None)
author_ids = self._field_ids_for('authors', book_id)
aut_list = [self._author_data(i) for i in author_ids]
aum = []
aus = {}
aul = {}
for rec in aut_list:
aut = rec['name']
aum.append(aut)
aus[aut] = rec['sort']
aul[aut] = rec['link']
mi.title = self._field_for('title', book_id,
default_value=_('Unknown'))
mi.authors = aum
mi.author_sort = self._field_for('author_sort', book_id,
default_value=_('Unknown'))
mi.author_sort_map = aus
mi.author_link_map = aul
mi.comments = self._field_for('comments', book_id)
mi.publisher = self._field_for('publisher', book_id)
n = now()
mi.timestamp = self._field_for('timestamp', book_id, default_value=n)
mi.pubdate = self._field_for('pubdate', book_id, default_value=n)
mi.uuid = self._field_for('uuid', book_id,
default_value='dummy')
mi.title_sort = self._field_for('sort', book_id,
default_value=_('Unknown'))
mi.book_size = self._field_for('size', book_id, default_value=0)
mi.ondevice_col = self._field_for('ondevice', book_id, default_value='')
mi.last_modified = self._field_for('last_modified', book_id,
default_value=n)
formats = self._field_for('formats', book_id)
mi.format_metadata = {}
if not formats:
formats = None
else:
for f in formats:
mi.format_metadata[f] = self._format_metadata(book_id, f)
formats = ','.join(formats)
mi.formats = formats
mi.has_cover = _('Yes') if self._field_for('cover', book_id,
default_value=False) else ''
mi.tags = list(self._field_for('tags', book_id, default_value=()))
mi.series = self._field_for('series', book_id)
if mi.series:
mi.series_index = self._field_for('series_index', book_id,
default_value=1.0)
mi.rating = self._field_for('rating', book_id)
mi.set_identifiers(self._field_for('identifiers', book_id,
default_value={}))
mi.application_id = book_id
mi.id = book_id
composites = {}
for key, meta in self.field_metadata.custom_iteritems():
mi.set_user_metadata(key, meta)
if meta['datatype'] == 'composite':
composites.append(key)
else:
mi.set(key, val=self._field_for(meta['label'], book_id),
extra=self._field_for(meta['label']+'_index', book_id))
for c in composites:
mi.set(key, val=self._composite_for(key, book_id, mi))
user_cat_vals = {}
if get_user_categories:
user_cats = self.prefs['user_categories']
for ucat in user_cats:
res = []
for name,cat,ign in user_cats[ucat]:
v = mi.get(cat, None)
if isinstance(v, list):
if name in v:
res.append([name,cat])
elif name == v:
res.append([name,cat])
user_cat_vals[ucat] = res
mi.user_categories = user_cat_vals
return mi
# }}}
# Cache Layer API {{{
@api
def init(self):
'''
Initialize this cache with data from the backend.
'''
with self.write_lock:
self.backend.read_tables()
for field, table in self.backend.tables.iteritems():
self.fields[field] = create_field(field, table)
if table.metadata['datatype'] == 'composite':
self.composites.add(field)
self.fields['ondevice'] = create_field('ondevice', None)
@read_api
def field_for(self, name, book_id, default_value=None):
'''
Return the value of the field ``name`` for the book identified by
``book_id``. If no such book exists or it has no defined value for the
field ``name`` or no such field exists, then ``default_value`` is returned.
The returned value for is_multiple fields are always tuples.
'''
if self.composites and name in self.composites:
return self.composite_for(name, book_id,
default_value=default_value)
try:
return self.fields[name].for_book(book_id, default_value=default_value)
except (KeyError, IndexError):
return default_value
@read_api
def composite_for(self, name, book_id, mi=None, default_value=''):
try:
f = self.fields[name]
except KeyError:
return default_value
if mi is None:
return f.get_value_with_cache(book_id, partial(self._get_metadata,
get_user_categories=False))
else:
return f.render_composite(book_id, mi)
@read_api
def field_ids_for(self, name, book_id):
'''
Return the ids (as a tuple) for the values that the field ``name`` has on the book
identified by ``book_id``. If there are no values, or no such book, or
no such field, an empty tuple is returned.
'''
try:
return self.fields[name].ids_for_book(book_id)
except (KeyError, IndexError):
return ()
@read_api
def books_for_field(self, name, item_id):
'''
Return all the books associated with the item identified by
``item_id``, where the item belongs to the field ``name``.
Returned value is a tuple of book ids, or the empty tuple if the item
or the field does not exist.
'''
try:
return self.fields[name].books_for(item_id)
except (KeyError, IndexError):
return ()
@read_api
def all_book_ids(self):
'''
Frozen set of all known book ids.
'''
return frozenset(self.fields['uuid'].iter_book_ids())
@read_api
def all_field_ids(self, name):
'''
Frozen set of ids for all values in the field ``name``.
'''
return frozenset(iter(self.fields[name]))
@read_api
def author_data(self, author_id):
'''
Return author data as a dictionary with keys: name, sort, link
If no author with the specified id is found an empty dictionary is
returned.
'''
try:
return self.fields['authors'].author_data(author_id)
except (KeyError, IndexError):
return {}
@read_api
def format_metadata(self, book_id, fmt, allow_cache=True):
if not fmt:
return {}
fmt = fmt.upper()
if allow_cache:
x = self.format_metadata_cache[book_id].get(fmt, None)
if x is not None:
return x
try:
name = self.fields['formats'].format_fname(book_id, fmt)
path = self._field_for('path', book_id).replace('/', os.sep)
except:
return {}
ans = {}
if path and name:
ans = self.backend.format_metadata(book_id, fmt, name, path)
self.format_metadata_cache[book_id][fmt] = ans
return ans
@api
def get_metadata(self, book_id,
get_cover=False, get_user_categories=True, cover_as_data=False):
'''
Return metadata for the book identified by book_id as a :class:`Metadata` object.
Note that the list of formats is not verified. If get_cover is True,
the cover is returned, either a path to temp file as mi.cover or if
cover_as_data is True then as mi.cover_data.
'''
with self.read_lock:
mi = self._get_metadata(book_id, get_user_categories=get_user_categories)
if get_cover:
if cover_as_data:
cdata = self.cover(book_id)
if cdata:
mi.cover_data = ('jpeg', cdata)
else:
mi.cover = self.cover(book_id, as_path=True)
return mi
@api
def cover(self, book_id,
as_file=False, as_image=False, as_path=False):
'''
Return the cover image or None. By default, returns the cover as a
bytestring.
WARNING: Using as_path will copy the cover to a temp file and return
the path to the temp file. You should delete the temp file when you are
done with it.
:param as_file: If True return the image as an open file object (a SpooledTemporaryFile)
:param as_image: If True return the image as a QImage object
:param as_path: If True return the image as a path pointing to a
temporary file
'''
with self.read_lock:
try:
path = self._field_for('path', book_id).replace('/', os.sep)
except:
return None
with self.record_lock.lock(book_id):
return self.backend.cover(path, as_file=as_file, as_image=as_image,
as_path=as_path)
@read_api
def multisort(self, fields):
all_book_ids = frozenset(self._all_book_ids())
get_metadata = partial(self._get_metadata, get_user_categories=False)
sort_keys = tuple(self.fields[field[0]].sort_keys_for_books(get_metadata,
all_book_ids) for field in fields)
if len(sort_keys) == 1:
sk = sort_keys[0]
return sorted(all_book_ids, key=lambda i:sk[i], reverse=not
fields[1])
else:
return sorted(all_book_ids, key=partial(SortKey, fields, sort_keys))
# }}}
class SortKey(object):
def __init__(self, fields, sort_keys, book_id):
self.orders = tuple(1 if f[1] else -1 for f in fields)
self.sort_key = tuple(sk[book_id] for sk in sort_keys)
def __cmp__(self, other):
for i, order in enumerate(self.orders):
ans = cmp(self.sort_key[i], other.sort_key[i])
if ans != 0:
return ans * order
return 0
# Testing {{{
def test(library_path):
from calibre.db.backend import DB
backend = DB(library_path)
cache = Cache(backend)
cache.init()
print ('All book ids:', cache.all_book_ids())
if __name__ == '__main__':
from calibre.utils.config import prefs
test(prefs['library_path'])
# }}}

257
src/calibre/db/fields.py Normal file
View File

@ -0,0 +1,257 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
from future_builtins import map
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from threading import Lock
from calibre.db.tables import ONE_ONE, MANY_ONE, MANY_MANY
from calibre.utils.icu import sort_key
class Field(object):
def __init__(self, name, table):
self.name, self.table = name, table
self.has_text_data = self.metadata['datatype'] in ('text', 'comments',
'series', 'enumeration')
self.table_type = self.table.table_type
dt = self.metadata['datatype']
self._sort_key = (sort_key if dt == 'text' else lambda x: x)
@property
def metadata(self):
return self.table.metadata
def for_book(self, book_id, default_value=None):
'''
Return the value of this field for the book identified by book_id.
When no value is found, returns ``default_value``.
'''
raise NotImplementedError()
def ids_for_book(self, book_id):
'''
Return a tuple of items ids for items associated with the book
identified by book_ids. Returns an empty tuple if no such items are
found.
'''
raise NotImplementedError()
def books_for(self, item_id):
'''
Return the ids of all books associated with the item identified by
item_id as a tuple. An empty tuple is returned if no books are found.
'''
raise NotImplementedError()
def __iter__(self):
'''
Iterate over the ids for all values in this field
'''
raise NotImplementedError()
def sort_keys_for_books(self, get_metadata, all_book_ids):
'''
Return a mapping of book_id -> sort_key. The sort key is suitable for
use in sorting the list of all books by this field, via the python cmp
method.
'''
raise NotImplementedError()
class OneToOneField(Field):
def for_book(self, book_id, default_value=None):
return self.table.book_col_map.get(book_id, default_value)
def ids_for_book(self, book_id):
return (book_id,)
def books_for(self, item_id):
return (item_id,)
def __iter__(self):
return self.table.book_col_map.iterkeys()
def iter_book_ids(self):
return self.table.book_col_map.iterkeys()
def sort_keys_for_books(self, get_metadata, all_book_ids):
return {id_ : self._sort_key(self.book_col_map.get(id_, '')) for id_ in
all_book_ids}
class CompositeField(OneToOneField):
def __init__(self, *args, **kwargs):
OneToOneField.__init__(self, *args, **kwargs)
self._render_cache = {}
self._lock = Lock()
def render_composite(self, book_id, mi):
with self._lock:
ans = self._render_cache.get(book_id, None)
if ans is None:
ans = mi.get(self.metadata['label'])
with self._lock:
self._render_cache[book_id] = ans
return ans
def clear_cache(self):
with self._lock:
self._render_cache = {}
def pop_cache(self, book_id):
with self._lock:
self._render_cache.pop(book_id, None)
def get_value_with_cache(self, book_id, get_metadata):
with self._lock:
ans = self._render_cache.get(book_id, None)
if ans is None:
mi = get_metadata(book_id)
ans = mi.get(self.metadata['label'])
return ans
def sort_keys_for_books(self, get_metadata, all_book_ids):
return {id_ : sort_key(self.get_value_with_cache(id_, get_metadata)) for id_ in
all_book_ids}
class OnDeviceField(OneToOneField):
def __init__(self, name, table):
self.name = name
self.book_on_device_func = None
def book_on_device(self, book_id):
if callable(self.book_on_device_func):
return self.book_on_device_func(book_id)
return None
def set_book_on_device_func(self, func):
self.book_on_device_func = func
def for_book(self, book_id, default_value=None):
loc = []
count = 0
on = self.book_on_device(book_id)
if on is not None:
m, a, b, count = on[:4]
if m is not None:
loc.append(_('Main'))
if a is not None:
loc.append(_('Card A'))
if b is not None:
loc.append(_('Card B'))
return ', '.join(loc) + ((' (%s books)'%count) if count > 1 else '')
def __iter__(self):
return iter(())
def iter_book_ids(self):
return iter(())
def sort_keys_for_books(self, get_metadata, all_book_ids):
return {id_ : self.for_book(id_) for id_ in
all_book_ids}
class ManyToOneField(Field):
def for_book(self, book_id, default_value=None):
ids = self.table.book_col_map.get(book_id, None)
if ids is not None:
ans = self.id_map[ids]
else:
ans = default_value
return ans
def ids_for_book(self, book_id):
id_ = self.table.book_col_map.get(book_id, None)
if id_ is None:
return ()
return (id_,)
def books_for(self, item_id):
return self.table.col_book_map.get(item_id, ())
def __iter__(self):
return self.table.id_map.iterkeys()
def sort_keys_for_books(self, get_metadata, all_book_ids):
keys = {id_ : self._sort_key(self.id_map.get(id_, '')) for id_ in
all_book_ids}
return {id_ : keys.get(
self.book_col_map.get(id_, None), '') for id_ in all_book_ids}
class ManyToManyField(Field):
def __init__(self, *args, **kwargs):
Field.__init__(self, *args, **kwargs)
self.alphabetical_sort = self.name != 'authors'
def for_book(self, book_id, default_value=None):
ids = self.table.book_col_map.get(book_id, ())
if ids:
ans = tuple(self.id_map[i] for i in ids)
else:
ans = default_value
return ans
def ids_for_book(self, book_id):
return self.table.book_col_map.get(book_id, ())
def books_for(self, item_id):
return self.table.col_book_map.get(item_id, ())
def __iter__(self):
return self.table.id_map.iterkeys()
def sort_keys_for_books(self, get_metadata, all_book_ids):
keys = {id_ : self._sort_key(self.id_map.get(id_, '')) for id_ in
all_book_ids}
def sort_key_for_book(book_id):
item_ids = self.table.book_col_map.get(book_id, ())
if self.alphabetical_sort:
item_ids = sorted(item_ids, key=keys.get)
return tuple(map(keys.get, item_ids))
return {id_ : sort_key_for_book(id_) for id_ in all_book_ids}
class AuthorsField(ManyToManyField):
def author_data(self, author_id):
return {
'name' : self.table.id_map[author_id],
'sort' : self.table.asort_map[author_id],
'link' : self.table.alink_map[author_id],
}
class FormatsField(ManyToManyField):
def format_fname(self, book_id, fmt):
return self.table.fname_map[book_id][fmt.upper()]
def create_field(name, table):
cls = {
ONE_ONE : OneToOneField,
MANY_ONE : ManyToOneField,
MANY_MANY : ManyToManyField,
}[table.table_type]
if name == 'authors':
cls = AuthorsField
elif name == 'ondevice':
cls = OnDeviceField
elif name == 'formats':
cls = FormatsField
elif table.metadata['datatype'] == 'composite':
cls = CompositeField
return cls(name, table)

View File

@ -7,7 +7,9 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from threading import Lock, Condition, current_thread
from threading import Lock, Condition, current_thread, RLock
from functools import partial
from collections import Counter
class LockingError(RuntimeError):
pass
@ -37,7 +39,7 @@ def create_locks():
l = SHLock()
return RWLockWrapper(l), RWLockWrapper(l, is_shared=False)
class SHLock(object):
class SHLock(object): # {{{
'''
Shareable lock class. Used to implement the Multiple readers-single writer
paradigm. As best as I can tell, neither writer nor reader starvation
@ -79,6 +81,11 @@ class SHLock(object):
return self._acquire_exclusive(blocking)
assert not (self.is_shared and self.is_exclusive)
def owns_lock(self):
me = current_thread()
with self._lock:
return self._exclusive_owner is me or me in self._shared_owners
def release(self):
''' Release the lock. '''
# This decrements the appropriate lock counters, and if the lock
@ -189,6 +196,8 @@ class SHLock(object):
def _return_waiter(self, waiter):
self._free_waiters.append(waiter)
# }}}
class RWLockWrapper(object):
def __init__(self, shlock, is_shared=True):
@ -200,16 +209,124 @@ class RWLockWrapper(object):
return self
def __exit__(self, *args):
self.release()
def release(self):
self._shlock.release()
def owns_lock(self):
return self._shlock.owns_lock()
class RecordLock(object):
'''
Lock records identified by hashable ids. To use
rl = RecordLock()
with rl.lock(some_id):
# do something
This will lock the record identified by some_id exclusively. The lock is
recursive, which means that you can lock the same record multiple times in
the same thread.
This class co-operates with the SHLock class. If you try to lock a record
in a thread that already holds the SHLock, a LockingError is raised. This
is to prevent the possibility of a cross-lock deadlock.
A cross-lock deadlock is still possible if you first lock a record and then
acquire the SHLock, but the usage pattern for this lock makes this highly
unlikely (this lock should be acquired immediately before any file I/O on
files in the library and released immediately after).
'''
class Wrap(object):
def __init__(self, release):
self.release = release
def __enter__(self):
return self
def __exit__(self, *args, **kwargs):
self.release()
self.release = None
def __init__(self, sh_lock):
self._lock = Lock()
# This is for recycling lock objects.
self._free_locks = [RLock()]
self._records = {}
self._counter = Counter()
self.sh_lock = sh_lock
def lock(self, record_id):
if self.sh_lock.owns_lock():
raise LockingError('Current thread already holds a shared lock,'
' you cannot also ask for record lock as this could cause a'
' deadlock.')
with self._lock:
l = self._records.get(record_id, None)
if l is None:
l = self._take_lock()
self._records[record_id] = l
self._counter[record_id] += 1
l.acquire()
return RecordLock.Wrap(partial(self.release, record_id))
def release(self, record_id):
with self._lock:
l = self._records.pop(record_id, None)
if l is None:
raise LockingError('No lock acquired for record %r'%record_id)
l.release()
self._counter[record_id] -= 1
if self._counter[record_id] > 0:
self._records[record_id] = l
else:
self._return_lock(l)
def _take_lock(self):
try:
return self._free_locks.pop()
except IndexError:
return RLock()
def _return_lock(self, lock):
self._free_locks.append(lock)
# Tests {{{
if __name__ == '__main__':
import time, random, unittest
from threading import Thread
class TestSHLock(unittest.TestCase):
"""Testcases for SHLock class."""
class TestLock(unittest.TestCase):
"""Testcases for Lock classes."""
def test_owns_locks(self):
lock = SHLock()
self.assertFalse(lock.owns_lock())
lock.acquire(shared=True)
self.assertTrue(lock.owns_lock())
lock.release()
self.assertFalse(lock.owns_lock())
lock.acquire(shared=False)
self.assertTrue(lock.owns_lock())
lock.release()
self.assertFalse(lock.owns_lock())
done = []
def test():
if not lock.owns_lock():
done.append(True)
lock.acquire()
t = Thread(target=test)
t.daemon = True
t.start()
t.join(1)
self.assertEqual(len(done), 1)
lock.release()
def test_multithread_deadlock(self):
lock = SHLock()
@ -345,8 +462,38 @@ if __name__ == '__main__':
self.assertFalse(lock.is_shared)
self.assertFalse(lock.is_exclusive)
def test_record_lock(self):
shlock = SHLock()
lock = RecordLock(shlock)
suite = unittest.TestLoader().loadTestsFromTestCase(TestSHLock)
shlock.acquire()
self.assertRaises(LockingError, lock.lock, 1)
shlock.release()
with lock.lock(1):
with lock.lock(1):
pass
def dolock():
with lock.lock(1):
time.sleep(0.1)
t = Thread(target=dolock)
t.daemon = True
with lock.lock(1):
t.start()
t.join(0.2)
self.assertTrue(t.is_alive())
t.join(0.11)
self.assertFalse(t.is_alive())
t = Thread(target=dolock)
t.daemon = True
with lock.lock(2):
t.start()
t.join(0.11)
self.assertFalse(t.is_alive())
suite = unittest.TestLoader().loadTestsFromTestCase(TestLock)
unittest.TextTestRunner(verbosity=2).run(suite)
# }}}

View File

@ -12,11 +12,13 @@ from datetime import datetime
from dateutil.tz import tzoffset
from calibre.constants import plugins
from calibre.utils.date import parse_date, local_tz
from calibre.utils.date import parse_date, local_tz, UNDEFINED_DATE
from calibre.ebooks.metadata import author_to_author_sort
_c_speedup = plugins['speedup'][0]
ONE_ONE, MANY_ONE, MANY_MANY = xrange(3)
def _c_convert_timestamp(val):
if not val:
return None
@ -27,8 +29,11 @@ def _c_convert_timestamp(val):
if ret is None:
return parse_date(val, as_utc=False)
year, month, day, hour, minutes, seconds, tzsecs = ret
try:
return datetime(year, month, day, hour, minutes, seconds,
tzinfo=tzoffset(None, tzsecs)).astimezone(local_tz)
except OverflowError:
return UNDEFINED_DATE.astimezone(local_tz)
class Table(object):
@ -57,6 +62,8 @@ class OneToOneTable(Table):
timestamp, size, etc.
'''
table_type = ONE_ONE
def read(self, db):
self.book_col_map = {}
idcol = 'id' if self.metadata['table'] == 'books' else 'book'
@ -73,6 +80,17 @@ class SizeTable(OneToOneTable):
'WHERE data.book=books.id) FROM books'):
self.book_col_map[row[0]] = self.unserialize(row[1])
class CompositeTable(OneToOneTable):
def read(self, db):
self.book_col_map = {}
d = self.metadata['display']
self.composite_template = ['composite_template']
self.contains_html = d['contains_html']
self.make_category = d['make_category']
self.composite_sort = d['composite_sort']
self.use_decorations = d['use_decorations']
class ManyToOneTable(Table):
'''
@ -82,9 +100,10 @@ class ManyToOneTable(Table):
Each book however has only one value for data of this type.
'''
table_type = MANY_ONE
def read(self, db):
self.id_map = {}
self.extra_map = {}
self.col_book_map = {}
self.book_col_map = {}
self.read_id_maps(db)
@ -105,6 +124,9 @@ class ManyToOneTable(Table):
self.col_book_map[row[1]].append(row[0])
self.book_col_map[row[0]] = row[1]
for key in tuple(self.col_book_map.iterkeys()):
self.col_book_map[key] = tuple(self.col_book_map[key])
class ManyToManyTable(ManyToOneTable):
'''
@ -113,6 +135,8 @@ class ManyToManyTable(ManyToOneTable):
book. For example: tags or authors.
'''
table_type = MANY_MANY
def read_maps(self, db):
for row in db.conn.execute(
'SELECT book, {0} FROM {1}'.format(
@ -124,14 +148,21 @@ class ManyToManyTable(ManyToOneTable):
self.book_col_map[row[0]] = []
self.book_col_map[row[0]].append(row[1])
for key in tuple(self.col_book_map.iterkeys()):
self.col_book_map[key] = tuple(self.col_book_map[key])
for key in tuple(self.book_col_map.iterkeys()):
self.book_col_map[key] = tuple(self.book_col_map[key])
class AuthorsTable(ManyToManyTable):
def read_id_maps(self, db):
self.alink_map = {}
self.asort_map = {}
for row in db.conn.execute(
'SELECT id, name, sort, link FROM authors'):
self.id_map[row[0]] = row[1]
self.extra_map[row[0]] = (row[2] if row[2] else
self.asort_map[row[0]] = (row[2] if row[2] else
author_to_author_sort(row[1]))
self.alink_map[row[0]] = row[3]
@ -141,14 +172,25 @@ class FormatsTable(ManyToManyTable):
pass
def read_maps(self, db):
self.fname_map = {}
for row in db.conn.execute('SELECT book, format, name FROM data'):
if row[1] is not None:
if row[1] not in self.col_book_map:
self.col_book_map[row[1]] = []
self.col_book_map[row[1]].append(row[0])
fmt = row[1].upper()
if fmt not in self.col_book_map:
self.col_book_map[fmt] = []
self.col_book_map[fmt].append(row[0])
if row[0] not in self.book_col_map:
self.book_col_map[row[0]] = []
self.book_col_map[row[0]].append((row[1], row[2]))
self.book_col_map[row[0]].append(fmt)
if row[0] not in self.fname_map:
self.fname_map[row[0]] = {}
self.fname_map[row[0]][fmt] = row[2]
for key in tuple(self.col_book_map.iterkeys()):
self.col_book_map[key] = tuple(self.col_book_map[key])
for key in tuple(self.book_col_map.iterkeys()):
self.book_col_map[key] = tuple(self.book_col_map[key])
class IdentifiersTable(ManyToManyTable):
@ -162,6 +204,9 @@ class IdentifiersTable(ManyToManyTable):
self.col_book_map[row[1]] = []
self.col_book_map[row[1]].append(row[0])
if row[0] not in self.book_col_map:
self.book_col_map[row[0]] = []
self.book_col_map[row[0]].append((row[1], row[2]))
self.book_col_map[row[0]] = {}
self.book_col_map[row[0]][row[1]] = row[2]
for key in tuple(self.col_book_map.iterkeys()):
self.col_book_map[key] = tuple(self.col_book_map[key])

109
src/calibre/db/view.py Normal file
View File

@ -0,0 +1,109 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from functools import partial
def sanitize_sort_field_name(field_metadata, field):
field = field_metadata.search_term_to_field_key(field.lower().strip())
# translate some fields to their hidden equivalent
field = {'title': 'sort', 'authors':'author_sort'}.get(field, field)
return field
class View(object):
def __init__(self, cache):
self.cache = cache
self.marked_ids = {}
self._field_getters = {}
for col, idx in cache.backend.FIELD_MAP.iteritems():
if isinstance(col, int):
label = self.cache.backend.custom_column_num_map[col]['label']
label = (self.cache.backend.field_metadata.custom_field_prefix
+ label)
self._field_getters[idx] = partial(self.get, label)
else:
try:
self._field_getters[idx] = {
'id' : self._get_id,
'au_map' : self.get_author_data,
'ondevice': self.get_ondevice,
'marked' : self.get_marked,
}[col]
except KeyError:
self._field_getters[idx] = partial(self.get, col)
self._map = list(self.cache.all_book_ids())
self._map_filtered = list(self._map)
@property
def field_metadata(self):
return self.cache.field_metadata
def _get_id(self, idx, index_is_id=True):
ans = idx if index_is_id else self.index_to_id(idx)
return ans
def get_field_map_field(self, row, col, index_is_id=True):
'''
Supports the legacy FIELD_MAP interface for getting metadata. Do not use
in new code.
'''
getter = self._field_getters[col]
return getter(row, index_is_id=index_is_id)
def index_to_id(self, idx):
return self._map_filtered[idx]
def get(self, field, idx, index_is_id=True, default_value=None):
id_ = idx if index_is_id else self.index_to_id(idx)
return self.cache.field_for(field, id_)
def get_ondevice(self, idx, index_is_id=True, default_value=''):
id_ = idx if index_is_id else self.index_to_id(idx)
self.cache.field_for('ondevice', id_, default_value=default_value)
def get_marked(self, idx, index_is_id=True, default_value=None):
id_ = idx if index_is_id else self.index_to_id(idx)
return self.marked_ids.get(id_, default_value)
def get_author_data(self, idx, index_is_id=True, default_value=()):
'''
Return author data for all authors of the book identified by idx as a
tuple of dictionaries. The dictionaries should never be empty, unless
there is a bug somewhere. The list could be empty if idx point to an
non existent book, or book with no authors (though again a book with no
authors should never happen).
Each dictionary has the keys: name, sort, link. Link can be an empty
string.
default_value is ignored, this method always returns a tuple
'''
id_ = idx if index_is_id else self.index_to_id(idx)
with self.cache.read_lock:
ids = self.cache._field_ids_for('authors', id_)
ans = []
for id_ in ids:
ans.append(self.cache._author_data(id_))
return tuple(ans)
def multisort(self, fields=[], subsort=False):
fields = [(sanitize_sort_field_name(self.field_metadata, x), bool(y)) for x, y in fields]
keys = self.field_metadata.sortable_field_keys()
fields = [x for x in fields if x[0] in keys]
if subsort and 'sort' not in [x[0] for x in fields]:
fields += [('sort', True)]
if not fields:
fields = [('timestamp', False)]
sorted_book_ids = self.cache.multisort(fields)
sorted_book_ids
# TODO: change maps

View File

@ -39,7 +39,7 @@ class ANDROID(USBMS):
0x22b8 : { 0x41d9 : [0x216], 0x2d61 : [0x100], 0x2d67 : [0x100],
0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216],
0x4286 : [0x216], 0x42b3 : [0x216], 0x42b4 : [0x216],
0x7086 : [0x0226], 0x70a8: [0x9999],
0x7086 : [0x0226], 0x70a8: [0x9999], 0x42c4 : [0x216],
},
# Sony Ericsson
@ -47,10 +47,12 @@ class ANDROID(USBMS):
# Google
0x18d1 : {
0x0001 : [0x0223],
0x4e11 : [0x0100, 0x226, 0x227],
0x4e12 : [0x0100, 0x226, 0x227],
0x4e21 : [0x0100, 0x226, 0x227],
0xb058: [0x0222, 0x226, 0x227]},
0xb058 : [0x0222, 0x226, 0x227]
},
# Samsung
0x04e8 : { 0x681d : [0x0222, 0x0223, 0x0224, 0x0400],
@ -60,6 +62,7 @@ class ANDROID(USBMS):
0x685e : [0x0400],
0x6860 : [0x0400],
0x6877 : [0x0400],
0x689e : [0x0400],
},
# Viewsonic
@ -124,7 +127,8 @@ class ANDROID(USBMS):
'IDEOS_TABLET', 'MYTOUCH_4G', 'UMS_COMPOSITE', 'SCH-I800_CARD',
'7', 'A956', 'A955', 'A43', 'ANDROID_PLATFORM', 'TEGRA_2',
'MB860', 'MULTI-CARD', 'MID7015A', 'INCREDIBLE', 'A7EB', 'STREAK',
'MB525', 'ANDROID2.3', 'SGH-I997']
'MB525', 'ANDROID2.3', 'SGH-I997', 'GT-I5800_CARD', 'MB612',
'GT-S5830_CARD', 'GT-S5570_CARD']
WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD',

View File

@ -35,9 +35,9 @@ class EB600(USBMS):
PRODUCT_ID = [0x1688]
BCD = [0x110]
VENDOR_NAME = ['NETRONIX', 'WOLDER']
WINDOWS_MAIN_MEM = ['EBOOK', 'MIBUK_GAMMA_6.2']
WINDOWS_CARD_A_MEM = 'EBOOK'
VENDOR_NAME = ['NETRONIX', 'WOLDER', 'MD86371']
WINDOWS_MAIN_MEM = ['EBOOK', 'MIBUK_GAMMA_6.2', 'MD86371']
WINDOWS_CARD_A_MEM = ['EBOOK', 'MD86371']
OSX_MAIN_MEM = 'EB600 Internal Storage Media'
OSX_CARD_A_MEM = 'EB600 Card Storage Media'

View File

@ -131,7 +131,7 @@ class AZBOOKA(ALEX):
description = _('Communicate with the Azbooka')
VENDOR_NAME = 'LINUX'
WINDOWS_MAIN_MEM = 'FILE-STOR_GADGET'
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'FILE-STOR_GADGET'
MAIN_MEMORY_VOLUME_LABEL = 'Azbooka Internal Memory'

View File

@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en'
import os
import sqlite3 as sqlite
from contextlib import closing
from calibre.devices.usbms.books import BookList
from calibre.devices.kobo.books import Book
@ -22,7 +23,7 @@ class KOBO(USBMS):
gui_name = 'Kobo Reader'
description = _('Communicate with the Kobo Reader')
author = 'Timothy Legge'
version = (1, 0, 9)
version = (1, 0, 10)
dbversion = 0
fwversion = 0
@ -48,12 +49,16 @@ class KOBO(USBMS):
VIRTUAL_BOOK_EXTENSIONS = frozenset(['kobo'])
EXTRA_CUSTOMIZATION_MESSAGE = _('The Kobo supports only one collection '
'currently: the \"Im_Reading\" list. Create a tag called \"Im_Reading\" ')+\
'for automatic management'
EXTRA_CUSTOMIZATION_MESSAGE = [
_('The Kobo supports several collections including ')+\
'Read, Closed, Im_Reading ' +\
_('Create tags for automatic management'),
]
EXTRA_CUSTOMIZATION_DEFAULT = ', '.join(['tags'])
OPT_COLLECTIONS = 0
def initialize(self):
USBMS.initialize(self)
self.book_class = Book
@ -188,7 +193,9 @@ class KOBO(USBMS):
traceback.print_exc()
return changed
connection = sqlite.connect(self.normalize_path(self._main_prefix + '.kobo/KoboReader.sqlite'))
with closing(sqlite.connect(
self.normalize_path(self._main_prefix +
'.kobo/KoboReader.sqlite'))) as connection:
# return bytestrings if the content cannot the decoded as unicode
connection.text_factory = lambda x: unicode(x, "utf-8", "ignore")
@ -258,7 +265,6 @@ class KOBO(USBMS):
need_sync = True
cursor.close()
connection.close()
# Remove books that are no longer in the filesystem. Cache contains
# indices into the booklist if book not in filesystem, None otherwise
@ -288,7 +294,8 @@ class KOBO(USBMS):
# 2) content
debug_print('delete_via_sql: ContentID: ', ContentID, 'ContentType: ', ContentType)
connection = sqlite.connect(self.normalize_path(self._main_prefix + '.kobo/KoboReader.sqlite'))
with closing(sqlite.connect(self.normalize_path(self._main_prefix +
'.kobo/KoboReader.sqlite'))) as connection:
# return bytestrings if the content cannot the decoded as unicode
connection.text_factory = lambda x: unicode(x, "utf-8", "ignore")
@ -326,8 +333,14 @@ class KOBO(USBMS):
except Exception as e:
if 'no such column' not in str(e):
raise
try:
cursor.execute('update content set ReadStatus=0, FirstTimeReading = \'true\', ___PercentRead=0 ' \
'where BookID is Null and ContentID =?',t)
except Exception as e:
if 'no such column' not in str(e):
raise
cursor.execute('update content set ReadStatus=0, FirstTimeReading = \'true\' ' \
'where BookID is Null and ContentID =?',t)
connection.commit()
@ -337,7 +350,6 @@ class KOBO(USBMS):
print "Error condition ImageID was not found"
print "You likely tried to delete a book that the kobo has not yet added to the database"
connection.close()
# If all this succeeds we need to delete the images files via the ImageID
return ImageID
@ -664,7 +676,8 @@ class KOBO(USBMS):
# Needs to be outside books collection as in the case of removing
# the last book from the collection the list of books is empty
# and the removal of the last book would not occur
connection = sqlite.connect(self.normalize_path(self._main_prefix + '.kobo/KoboReader.sqlite'))
with closing(sqlite.connect(self.normalize_path(self._main_prefix +
'.kobo/KoboReader.sqlite'))) as connection:
# return bytestrings if the content cannot the decoded as unicode
connection.text_factory = lambda x: unicode(x, "utf-8", "ignore")
@ -693,10 +706,10 @@ class KOBO(USBMS):
if category in readstatuslist.keys():
# Manage ReadStatus
self.set_readstatus(connection, ContentID, readstatuslist.get(category))
if category == 'Shortlist' and self.dbversion >= 14:
elif category == 'Shortlist' and self.dbversion >= 14:
# Manage FavouritesIndex/Shortlist
self.set_favouritesindex(connection, ContentID)
if category in accessibilitylist.keys():
elif category in accessibilitylist.keys():
# Do not manage the Accessibility List
pass
else: # No collections
@ -707,8 +720,6 @@ class KOBO(USBMS):
debug_print("No Collections - reseting FavouritesIndex")
self.reset_favouritesindex(connection, oncard)
connection.close()
# debug_print('Finished update_device_database_collections', collections_attributes)
def sync_booklists(self, booklists, end_session=True):
@ -723,7 +734,7 @@ class KOBO(USBMS):
opts = self.settings()
if opts.extra_customization:
collections = [x.lower().strip() for x in
opts.extra_customization.split(',')]
opts.extra_customization[self.OPT_COLLECTIONS].split(',')]
else:
collections = []

View File

@ -351,3 +351,29 @@ class MOOVYBOOK(USBMS):
def get_main_ebook_dir(self, for_upload=False):
return 'Books' if for_upload else self.EBOOK_DIR_MAIN
class COBY(USBMS):
name = 'COBY MP977 device interface'
gui_name = 'COBY'
description = _('Communicate with the COBY')
author = 'Kovid Goyal'
supported_platforms = ['windows', 'osx', 'linux']
# Ordered list of supported formats
FORMATS = ['epub', 'pdf']
VENDOR_ID = [0x1e74]
PRODUCT_ID = [0x7121]
BCD = [0x02]
VENDOR_NAME = 'USB_2.0'
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'MP977_DRIVER'
EBOOK_DIR_MAIN = ''
SUPPORTS_SUB_DIRS = False
def get_carda_ebook_dir(self, for_upload=False):
if for_upload:
return 'eBooks'
return self.EBOOK_DIR_CARD_A

View File

@ -1077,8 +1077,13 @@ class Device(DeviceConfig, DevicePlugin):
settings = self.settings()
template = self.save_template()
if mdata.tags and _('News') in mdata.tags:
try:
p = mdata.pubdate
date = (p.year, p.month, p.day)
except:
today = time.localtime()
template = "{title}_%d-%d-%d" % (today[0], today[1], today[2])
date = (today[0], today[1], today[2])
template = "{title}_%d-%d-%d" % date
use_subdirs = self.SUPPORTS_SUB_DIRS and settings.use_subdirs
fname = sanitize(fname)

View File

@ -94,11 +94,29 @@ class USBMS(CLI, Device):
self.report_progress(1.0, _('Get device information...'))
self.driveinfo = {}
if self._main_prefix is not None:
try:
self.driveinfo['main'] = self._update_driveinfo_file(self._main_prefix, 'main')
except (IOError, OSError) as e:
raise IOError(_('Failed to access files in the main memory of'
' your device. You should contact the device'
' manufacturer for support. Common fixes are:'
' try a different USB cable/USB port on your computer.'
' If you device has a "Reset to factory defaults" type'
' of setting somewhere, use it. Underlying error: %s')
% e)
try:
if self._card_a_prefix is not None:
self.driveinfo['A'] = self._update_driveinfo_file(self._card_a_prefix, 'A')
if self._card_b_prefix is not None:
self.driveinfo['B'] = self._update_driveinfo_file(self._card_b_prefix, 'B')
except (IOError, OSError) as e:
raise IOError(_('Failed to access files on the SD card in your'
' device. This can happen for many reasons. The SD card may be'
' corrupted, it may be too large for your device, it may be'
' write-protected, etc. Try a different SD card, or reformat'
' your SD card using the FAT32 filesystem. Also make sure'
' there are not too many files in the root of your SD card.'
' Underlying error: %s') % e)
return (self.get_gui_name(), '', '', '', self.driveinfo)
def set_driveinfo_name(self, location_code, name):

View File

@ -8,6 +8,7 @@ from various formats.
'''
import traceback, os, re
from cStringIO import StringIO
from calibre import CurrentDir
class ConversionError(Exception):
@ -159,7 +160,7 @@ def normalize(x):
return x
def calibre_cover(title, author_string, series_string=None,
output_format='jpg', title_size=46, author_size=36):
output_format='jpg', title_size=46, author_size=36, logo_path=None):
title = normalize(title)
author_string = normalize(author_string)
series_string = normalize(series_string)
@ -167,7 +168,9 @@ def calibre_cover(title, author_string, series_string=None,
lines = [TextLine(title, title_size), TextLine(author_string, author_size)]
if series_string:
lines.append(TextLine(series_string, author_size))
return create_cover_page(lines, I('library.png'), output_format='jpg')
if logo_path is None:
logo_path = I('library.png')
return create_cover_page(lines, logo_path, output_format='jpg')
UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc)$')
@ -207,4 +210,45 @@ def unit_convert(value, base, font, dpi):
result = value * 0.40
return result
def generate_masthead(title, output_path=None, width=600, height=60):
from calibre.ebooks.conversion.config import load_defaults
from calibre.utils.fonts import fontconfig
font_path = default_font = P('fonts/liberation/LiberationSerif-Bold.ttf')
recs = load_defaults('mobi_output')
masthead_font_family = recs.get('masthead_font', 'Default')
if masthead_font_family != 'Default':
masthead_font = fontconfig.files_for_family(masthead_font_family)
# Assume 'normal' always in dict, else use default
# {'normal': (path_to_font, friendly name)}
if 'normal' in masthead_font:
font_path = masthead_font['normal'][0]
if not font_path or not os.access(font_path, os.R_OK):
font_path = default_font
try:
from PIL import Image, ImageDraw, ImageFont
Image, ImageDraw, ImageFont
except ImportError:
import Image, ImageDraw, ImageFont
img = Image.new('RGB', (width, height), 'white')
draw = ImageDraw.Draw(img)
try:
font = ImageFont.truetype(font_path, 48)
except:
font = ImageFont.truetype(default_font, 48)
text = title.encode('utf-8')
width, height = draw.textsize(text, font=font)
left = max(int((width - width)/2.), 0)
top = max(int((height - height)/2.), 0)
draw.text((left, top), text, fill=(0,0,0), font=font)
if output_path is None:
f = StringIO()
img.save(f, 'JPEG')
return f.getvalue()
else:
with open(output_path, 'wb') as f:
img.save(f, 'JPEG')

View File

@ -38,8 +38,12 @@ ENCODING_PATS = [
ENTITY_PATTERN = re.compile(r'&(\S+?);')
def strip_encoding_declarations(raw):
limit = 50*1024
for pat in ENCODING_PATS:
raw = pat.sub('', raw)
prefix = raw[:limit]
suffix = raw[limit:]
prefix = pat.sub('', prefix)
raw = prefix + suffix
return raw
def substitute_entites(raw):

View File

@ -137,7 +137,9 @@ def add_pipeline_options(parser, plumber):
'extra_css', 'smarten_punctuation',
'margin_top', 'margin_left', 'margin_right',
'margin_bottom', 'change_justification',
'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size',
'insert_blank_line', 'insert_blank_line_size',
'remove_paragraph_spacing',
'remove_paragraph_spacing_indent_size',
'asciiize',
]
),
@ -208,12 +210,13 @@ def add_pipeline_options(parser, plumber):
if rec.level < rec.HIGH:
option_recommendation_to_cli_option(add_option, rec)
parser.add_option('--list-recipes', default=False, action='store_true',
help=_('List builtin recipes'))
def option_parser():
return OptionParser(usage=USAGE)
parser = OptionParser(usage=USAGE)
parser.add_option('--list-recipes', default=False, action='store_true',
help=_('List builtin recipe names. You can create an ebook from '
'a builtin recipe like this: ebook-convert "Recipe Name.recipe" '
'output.epub'))
return parser
class ProgressBar(object):

View File

@ -366,9 +366,9 @@ OptionRecommendation(name='remove_paragraph_spacing',
OptionRecommendation(name='remove_paragraph_spacing_indent_size',
recommended_value=1.5, level=OptionRecommendation.LOW,
help=_('When calibre removes inter paragraph spacing, it automatically '
help=_('When calibre removes blank lines between paragraphs, it automatically '
'sets a paragraph indent, to ensure that paragraphs can be easily '
'distinguished. This option controls the width of that indent.')
'distinguished. This option controls the width of that indent (in em).')
),
OptionRecommendation(name='prefer_metadata_cover',
@ -384,6 +384,13 @@ OptionRecommendation(name='insert_blank_line',
)
),
OptionRecommendation(name='insert_blank_line_size',
recommended_value=0.5, level=OptionRecommendation.LOW,
help=_('Set the height of the inserted blank lines (in em).'
' The height of the lines between paragraphs will be twice the value'
' set here.')
),
OptionRecommendation(name='remove_first_image',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Remove the first image from the input ebook. Useful if the '
@ -602,7 +609,7 @@ OptionRecommendation(name='sr3_replace',
input_fmt = os.path.splitext(self.input)[1]
if not input_fmt:
raise ValueError('Input file must have an extension')
input_fmt = input_fmt[1:].lower()
input_fmt = input_fmt[1:].lower().replace('original_', '')
self.archive_input_tdir = None
if input_fmt in ARCHIVE_FMTS:
self.log('Processing archive...')
@ -1048,6 +1055,7 @@ OptionRecommendation(name='sr3_replace',
with self.output_plugin:
self.output_plugin.convert(self.oeb, self.output, self.input_plugin,
self.opts, self.log)
self.oeb.clean_temp_files()
self.ui_reporter(1.)
run_plugins_on_postprocess(self.output, self.output_fmt)

View File

@ -303,6 +303,9 @@ class CSSPreProcessor(object):
class HTMLPreProcessor(object):
PREPROCESS = [
# Remove huge block of contiguous spaces as they slow down
# the following regexes pretty badly
(re.compile(r'\s{10000,}'), lambda m: ''),
# Some idiotic HTML generators (Frontpage I'm looking at you)
# Put all sorts of crap into <head>. This messes up lxml
(re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),

View File

@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
import os
from calibre import guess_type, walk
from calibre import guess_type
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata.opf2 import OPF
@ -25,16 +25,50 @@ class HTMLZInput(InputFormatPlugin):
accelerators):
self.log = log
html = u''
top_levels = []
# Extract content from zip archive.
zf = ZipFile(stream)
zf.extractall()
for x in walk('.'):
if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', '.htm'):
with open(x, 'rb') as tf:
html = tf.read()
# Find the HTML file in the archive. It needs to be
# top level.
index = u''
multiple_html = False
# Get a list of all top level files in the archive.
for x in os.listdir('.'):
if os.path.isfile(x):
top_levels.append(x)
# Try to find an index. file.
for x in top_levels:
if x.lower() in ('index.html', 'index.xhtml', 'index.htm'):
index = x
break
# Look for multiple HTML files in the archive. We look at the
# top level files only as only they matter in HTMLZ.
for x in top_levels:
if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', '.htm'):
# Set index to the first HTML file found if it's not
# called index.
if not index:
index = x
else:
multiple_html = True
# Warn the user if there multiple HTML file in the archive. HTMLZ
# supports a single HTML file. A conversion with a multiple HTML file
# HTMLZ archive probably won't turn out as the user expects. With
# Multiple HTML files ZIP input should be used in place of HTMLZ.
if multiple_html:
log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index)
if index:
with open(index, 'rb') as tf:
html = tf.read()
else:
raise Exception(_('No top level HTML file found.'))
if not html:
raise Exception(_('Top level HTML file %s is empty') % index)
# Encoding
if options.input_encoding:
@ -75,7 +109,7 @@ class HTMLZInput(InputFormatPlugin):
# Get the cover path from the OPF.
cover_path = None
opf = None
for x in walk('.'):
for x in top_levels:
if os.path.splitext(x)[1].lower() in ('.opf'):
opf = x
break

View File

@ -742,7 +742,7 @@ class Metadata(object):
ans += [('ISBN', unicode(self.isbn))]
ans += [(_('Tags'), u', '.join([unicode(t) for t in self.tags]))]
if self.series:
ans += [_('Series'), unicode(self.series) + ' #%s'%self.format_series_index()]
ans += [(_('Series'), unicode(self.series) + ' #%s'%self.format_series_index())]
ans += [(_('Language'), unicode(self.language))]
if self.timestamp is not None:
ans += [(_('Timestamp'), unicode(self.timestamp.isoformat(' ')))]

View File

@ -24,10 +24,9 @@ XPath = partial(etree.XPath, namespaces=NAMESPACES)
tostring = partial(etree.tostring, method='text', encoding=unicode)
def get_metadata(stream):
""" Return fb2 metadata as a L{MetaInformation} object """
''' Return fb2 metadata as a L{MetaInformation} object '''
root = _get_fbroot(stream)
book_title = _parse_book_title(root)
authors = _parse_authors(root)
@ -181,6 +180,7 @@ def _parse_series(root, mi):
def _parse_isbn(root, mi):
# some people try to put several isbn in this field, but it is not allowed. try to stick to the 1-st one in this case
isbn = XPath('normalize-space(//fb2:publish-info/fb2:isbn/text())')(root)
if isbn:
# some people try to put several isbn in this field, but it is not allowed. try to stick to the 1-st one in this case
if ',' in isbn:
isbn = isbn[:isbn.index(',')]
@ -232,4 +232,3 @@ def _get_fbroot(stream):
raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
root = etree.fromstring(raw, parser=parser)
return root

View File

@ -22,6 +22,7 @@ from calibre.utils.date import parse_date, isoformat
from calibre.utils.localization import get_lang
from calibre import prints, guess_type
from calibre.utils.cleantext import clean_ascii_chars
from calibre.utils.config import tweaks
class Resource(object): # {{{
'''
@ -527,7 +528,12 @@ class OPF(object): # {{{
category = MetadataField('type')
rights = MetadataField('rights')
series = MetadataField('series', is_dc=False)
series_index = MetadataField('series_index', is_dc=False, formatter=float, none_is=1)
if tweaks['use_series_auto_increment_tweak_when_importing']:
series_index = MetadataField('series_index', is_dc=False,
formatter=float, none_is=None)
else:
series_index = MetadataField('series_index', is_dc=False,
formatter=float, none_is=1)
title_sort = TitleSortField('title_sort', is_dc=False)
rating = MetadataField('rating', is_dc=False, formatter=int)
pubdate = MetadataField('date', formatter=parse_date,
@ -1024,8 +1030,10 @@ class OPF(object): # {{{
attrib = attrib or {}
attrib['name'] = 'calibre:' + name
name = '{%s}%s' % (self.NAMESPACES['opf'], 'meta')
nsmap = dict(self.NAMESPACES)
del nsmap['opf']
elem = etree.SubElement(self.metadata, name, attrib=attrib,
nsmap=self.NAMESPACES)
nsmap=nsmap)
elem.tail = '\n'
return elem

View File

@ -22,6 +22,7 @@ from calibre.ebooks.metadata.book.base import Metadata
from calibre.utils.date import utc_tz, as_utc
from calibre.utils.html2text import html2text
from calibre.utils.icu import lower
from calibre.utils.date import UNDEFINED_DATE
# Download worker {{{
class Worker(Thread):
@ -490,6 +491,8 @@ def identify(log, abort, # {{{
max_tags = msprefs['max_tags']
for r in results:
r.tags = r.tags[:max_tags]
if getattr(r.pubdate, 'year', 2000) <= UNDEFINED_DATE.year:
r.pubdate = None
if msprefs['swap_author_names']:
for r in results:

View File

@ -151,7 +151,7 @@ class ISBNDB(Source):
bl = feed.find('BookList')
if bl is None:
err = tostring(etree.find('errormessage'))
err = tostring(feed.find('errormessage'))
raise ValueError('ISBNDb query failed:' + err)
total_results = int(bl.get('total_results'))
shown_results = int(bl.get('shown_results'))

File diff suppressed because it is too large Load Diff

View File

@ -1,8 +1,12 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
Decompress MOBI files compressed with the Huff/cdic algorithm. Code thanks to darkninja
and igorsk.
@ -12,82 +16,92 @@ import struct
from calibre.ebooks.mobi import MobiError
class BitReader(object):
class Reader(object):
def __init__(self, data):
self.data, self.pos, self.nbits = data + "\x00\x00\x00\x00", 0, len(data) * 8
def __init__(self):
self.q = struct.Struct(b'>Q').unpack_from
def peek(self, n):
r, g = 0, 0
while g < n:
r, g = (r << 8) | ord(self.data[(self.pos+g)>>3]), g + 8 - ((self.pos+g) & 7)
return (r >> (g - n)) & ((1 << n) - 1)
def load_huff(self, huff):
if huff[0:8] != b'HUFF\x00\x00\x00\x18':
raise MobiError('Invalid HUFF header')
off1, off2 = struct.unpack_from(b'>LL', huff, 8)
def eat(self, n):
self.pos += n
return self.pos <= self.nbits
def dict1_unpack(v):
codelen, term, maxcode = v&0x1f, v&0x80, v>>8
assert codelen != 0
if codelen <= 8:
assert term
maxcode = ((maxcode + 1) << (32 - codelen)) - 1
return (codelen, term, maxcode)
self.dict1 = map(dict1_unpack, struct.unpack_from(b'>256L', huff, off1))
def left(self):
return self.nbits - self.pos
dict2 = struct.unpack_from(b'>64L', huff, off2)
self.mincode, self.maxcode = (), ()
for codelen, mincode in enumerate((0,) + dict2[0::2]):
self.mincode += (mincode << (32 - codelen), )
for codelen, maxcode in enumerate((0,) + dict2[1::2]):
self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1, )
self.dictionary = []
def load_cdic(self, cdic):
if cdic[0:8] != b'CDIC\x00\x00\x00\x10':
raise MobiError('Invalid CDIC header')
phrases, bits = struct.unpack_from(b'>LL', cdic, 8)
n = min(1<<bits, phrases-len(self.dictionary))
h = struct.Struct(b'>H').unpack_from
def getslice(off):
blen, = h(cdic, 16+off)
slice = cdic[18+off:18+off+(blen&0x7fff)]
return (slice, blen&0x8000)
self.dictionary += map(getslice, struct.unpack_from(b'>%dH' % n, cdic, 16))
def unpack(self, data):
q = self.q
bitsleft = len(data) * 8
data += b'\x00\x00\x00\x00\x00\x00\x00\x00'
pos = 0
x, = q(data, pos)
n = 32
s = []
while True:
if n <= 0:
pos += 4
x, = q(data, pos)
n += 32
code = (x >> n) & ((1 << 32) - 1)
codelen, term, maxcode = self.dict1[code >> 24]
if not term:
while code < self.mincode[codelen]:
codelen += 1
maxcode = self.maxcode[codelen]
n -= codelen
bitsleft -= codelen
if bitsleft < 0:
break
r = (maxcode - code) >> (32 - codelen)
slice_, flag = self.dictionary[r]
if not flag:
self.dictionary[r] = None
slice_ = self.unpack(slice_)
self.dictionary[r] = (slice_, 1)
s.append(slice_)
return b''.join(s)
class HuffReader(object):
def __init__(self, huffs):
self.huffs = huffs
self.reader = Reader()
self.reader.load_huff(huffs[0])
for cdic in huffs[1:]:
self.reader.load_cdic(cdic)
if huffs[0][0:4] != 'HUFF' or huffs[0][4:8] != '\x00\x00\x00\x18':
raise MobiError('Invalid HUFF header')
def unpack(self, section):
return self.reader.unpack(section)
if huffs[1][0:4] != 'CDIC' or huffs[1][4:8] != '\x00\x00\x00\x10':
raise ValueError('Invalid CDIC header')
self.entry_bits, = struct.unpack('>L', huffs[1][12:16])
off1,off2 = struct.unpack('>LL', huffs[0][16:24])
self.dict1 = struct.unpack('<256L', huffs[0][off1:off1+256*4])
self.dict2 = struct.unpack('<64L', huffs[0][off2:off2+64*4])
self.dicts = huffs[1:]
self.r = ''
def _unpack(self, bits, depth = 0):
if depth > 32:
raise MobiError('Corrupt file')
while bits.left():
dw = bits.peek(32)
v = self.dict1[dw >> 24]
codelen = v & 0x1F
assert codelen != 0
code = dw >> (32 - codelen)
r = (v >> 8)
if not (v & 0x80):
while code < self.dict2[(codelen-1)*2]:
codelen += 1
code = dw >> (32 - codelen)
r = self.dict2[(codelen-1)*2+1]
r -= code
assert codelen != 0
if not bits.eat(codelen):
return
dicno = r >> self.entry_bits
off1 = 16 + (r - (dicno << self.entry_bits)) * 2
dic = self.dicts[dicno]
off2 = 16 + ord(dic[off1]) * 256 + ord(dic[off1+1])
blen = ord(dic[off2]) * 256 + ord(dic[off2+1])
slice = dic[off2+2:off2+2+(blen&0x7fff)]
if blen & 0x8000:
self.r += slice
else:
self._unpack(BitReader(slice), depth + 1)
def unpack(self, data):
self.r = ''
self._unpack(BitReader(data))
return self.r
def decompress(self, sections):
r = ''
for data in sections:
r += self.unpack(data)
if r.endswith('#'):
r = r[:-1]
return r

View File

@ -0,0 +1,86 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, subprocess, shutil, tempfile
from lxml import etree
from calibre.constants import iswindows
from calibre.customize.ui import plugin_for_output_format
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.mobi.utils import detect_periodical
from calibre import CurrentDir
exe = 'kindlegen.exe' if iswindows else 'kindlegen'
def refactor_opf(opf, is_periodical, toc):
with open(opf, 'rb') as f:
root = etree.fromstring(f.read())
'''
for spine in root.xpath('//*[local-name() = "spine" and @toc]'):
# Do not use the NCX toc as kindlegen requires the section structure
# in the TOC to be duplicated in the HTML, asinine!
del spine.attrib['toc']
'''
if is_periodical:
metadata = root.xpath('//*[local-name() = "metadata"]')[0]
xm = etree.SubElement(metadata, 'x-metadata')
xm.tail = '\n'
xm.text = '\n\t'
mobip = etree.SubElement(xm, 'output', attrib={'encoding':"utf-8",
'content-type':"application/x-mobipocket-subscription-magazine"})
mobip.tail = '\n\t'
with open(opf, 'wb') as f:
f.write(etree.tostring(root, method='xml', encoding='utf-8',
xml_declaration=True))
def refactor_guide(oeb):
for key in list(oeb.guide):
if key not in ('toc', 'start', 'masthead'):
oeb.guide.remove(key)
def run_kindlegen(opf, log):
log.info('Running kindlegen on MOBIML created by calibre')
oname = os.path.splitext(opf)[0] + '.mobi'
p = subprocess.Popen([exe, opf, '-c1', '-verbose', '-o', oname],
stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
ko = p.stdout.read()
returncode = p.wait()
log.debug('kindlegen verbose output:')
log.debug(ko.decode('utf-8', 'replace'))
log.info('kindlegen returned returncode: %d'%returncode)
if not os.path.exists(oname) or os.stat(oname).st_size < 100:
raise RuntimeError('kindlegen did not produce any output. '
'kindlegen return code: %d'%returncode)
return oname
def kindlegen(oeb, opts, input_plugin, output_path):
is_periodical = detect_periodical(oeb.toc, oeb.log)
refactor_guide(oeb)
with TemporaryDirectory('_kindlegen_output') as tdir:
oeb_output = plugin_for_output_format('oeb')
oeb_output.convert(oeb, tdir, input_plugin, opts, oeb.log)
opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
refactor_opf(os.path.join(tdir, opf), is_periodical, oeb.toc)
try:
td = tempfile.gettempdir()
kd = os.path.join(td, 'kindlegen')
if os.path.exists(kd):
shutil.rmtree(kd)
shutil.copytree(tdir, kd)
oeb.log('kindlegen intermediate output stored in: %s'%kd)
except:
pass
with CurrentDir(tdir):
oname = run_kindlegen(opf, oeb.log)
shutil.copyfile(oname, output_path)

View File

@ -532,7 +532,7 @@ class MobiMLizer(object):
bstate.pbreak = True
if isblock:
para = bstate.para
if para is not None and para.text == u'\xa0':
if para is not None and para.text == u'\xa0' and len(para) < 1:
para.getparent().replace(para, etree.Element(XHTML('br')))
bstate.para = None
bstate.istate = None

View File

@ -27,7 +27,7 @@ class MOBIOutput(OutputFormatPlugin):
),
OptionRecommendation(name='no_inline_toc',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Don\'t add Table of Contents to end of book. Useful if '
help=_('Don\'t add Table of Contents to the book. Useful if '
'the book has its own table of contents.')),
OptionRecommendation(name='toc_title', recommended_value=None,
help=_('Title for any generated in-line table of contents.')
@ -45,6 +45,30 @@ class MOBIOutput(OutputFormatPlugin):
'the MOBI output plugin will try to convert margins specified'
' in the input document, otherwise it will ignore them.')
),
OptionRecommendation(name='mobi_toc_at_start',
recommended_value=False,
help=_('When adding the Table of Contents to the book, add it at the start of the '
'book instead of the end. Not recommended.')
),
OptionRecommendation(name='extract_to', recommended_value=None,
help=_('Extract the contents of the MOBI file to the'
' specified directory. If the directory already '
'exists, it will be deleted.')
),
OptionRecommendation(name='mobi_navpoints_only_deepest',
recommended_value=False,
help=_('When adding navpoints for the chapter-to-chapter'
' navigation on the kindle, use only the lowest level '
'of items in the TOC, instead of items at every level.')
),
OptionRecommendation(name='kindlegen',
recommended_value=False,
help=('Use kindlegen (must be in your PATH) to generate the'
' binary wrapper for the MOBI format. Useful to debug '
' the calibre MOBI output.')
),
])
def check_for_periodical(self):
@ -76,26 +100,6 @@ class MOBIOutput(OutputFormatPlugin):
else:
self.oeb.log.debug('Using mastheadImage supplied in manifest...')
def dump_toc(self, toc) :
self.log( "\n >>> TOC contents <<<")
self.log( " toc.title: %s" % toc.title)
self.log( " toc.href: %s" % toc.href)
for periodical in toc.nodes :
self.log( "\tperiodical title: %s" % periodical.title)
self.log( "\t href: %s" % periodical.href)
for section in periodical :
self.log( "\t\tsection title: %s" % section.title)
self.log( "\t\tfirst article: %s" % section.href)
for article in section :
self.log( "\t\t\tarticle title: %s" % repr(article.title))
self.log( "\t\t\t href: %s" % article.href)
def dump_manifest(self) :
self.log( "\n >>> Manifest entries <<<")
for href in self.oeb.manifest.hrefs :
self.log ("\t%s" % href)
def periodicalize_toc(self):
from calibre.ebooks.oeb.base import TOC
toc = self.oeb.toc
@ -150,24 +154,16 @@ class MOBIOutput(OutputFormatPlugin):
# Fix up the periodical href to point to first section href
toc.nodes[0].href = toc.nodes[0].nodes[0].href
# GR diagnostics
if self.opts.verbose > 3:
self.dump_toc(toc)
self.dump_manifest()
def convert(self, oeb, output_path, input_plugin, opts, log):
self.log, self.opts, self.oeb = log, opts, oeb
from calibre.ebooks.mobi.writer import PALM_MAX_IMAGE_SIZE, \
MobiWriter, PALMDOC, UNCOMPRESSED
from calibre.ebooks.mobi.mobiml import MobiMLizer
from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
from calibre.customize.ui import plugin_for_input_format
imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None
if not opts.no_inline_toc:
tocadder = HTMLTOCAdder(title=opts.toc_title)
tocadder = HTMLTOCAdder(title=opts.toc_title, position='start' if
opts.mobi_toc_at_start else 'end')
tocadder(oeb, opts)
mangler = CaseMangler()
mangler(oeb, opts)
@ -179,10 +175,23 @@ class MOBIOutput(OutputFormatPlugin):
mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
mobimlizer(oeb, opts)
self.check_for_periodical()
write_page_breaks_after_item = not input_plugin is plugin_for_input_format('cbz')
writer = MobiWriter(opts, imagemax=imagemax,
compression=UNCOMPRESSED if opts.dont_compress else PALMDOC,
prefer_author_sort=opts.prefer_author_sort,
write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz')
from calibre.utils.config import tweaks
if tweaks.get('new_mobi_writer', False):
from calibre.ebooks.mobi.writer2.main import MobiWriter
MobiWriter
else:
from calibre.ebooks.mobi.writer import MobiWriter
if opts.kindlegen:
from calibre.ebooks.mobi.kindlegen import kindlegen
kindlegen(oeb, opts, input_plugin, output_path)
else:
writer = MobiWriter(opts,
write_page_breaks_after_item=write_page_breaks_after_item)
writer(oeb, output_path)
if opts.extract_to is not None:
from calibre.ebooks.mobi.debug import inspect_mobi
ddir = opts.extract_to
inspect_mobi(output_path, ddir=ddir)

View File

@ -859,16 +859,19 @@ class MobiReader(object):
processed_records += list(range(self.book_header.huff_offset,
self.book_header.huff_offset + self.book_header.huff_number))
huff = HuffReader(huffs)
self.mobi_html = huff.decompress(text_sections)
unpack = huff.unpack
elif self.book_header.compression_type == '\x00\x02':
for section in text_sections:
self.mobi_html += decompress_doc(section)
unpack = decompress_doc
elif self.book_header.compression_type == '\x00\x01':
self.mobi_html = ''.join(text_sections)
unpack = lambda x: x
else:
raise MobiError('Unknown compression algorithm: %s' % repr(self.book_header.compression_type))
self.mobi_html = b''.join(map(unpack, text_sections))
if self.mobi_html.endswith(b'#'):
self.mobi_html = self.mobi_html[:-1]
if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
self.mobi_html = self.mobi_html.replace('\r ', '\n\n ')
self.mobi_html = self.mobi_html.replace('\0', '')
@ -933,6 +936,9 @@ class MobiReader(object):
continue
processed_records.append(i)
data = self.sections[i][0]
if data[:4] in (b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n'):
# A FLIS, FCIS, SRCS or EOF record, ignore
continue
buf = cStringIO.StringIO(data)
image_index += 1
try:

View File

@ -0,0 +1,363 @@
Reverse engineering the trailing byte sequences for hierarchical periodicals
===============================================================================
In the following, *vwi* means variable width integer and *fvwi* means a vwi whose lowest four bits are used as a flag. All the following information/inferences are from examining the output of kindlegen on a sample periodical. Given the general level of Amazon's incompetence, there are no guarantees that this information is the *best/most complete* way to do TBS indexing.
Sequence encoding:
0b1000 : Continuation bit
First sequences:
0b0010 : 80
0b0011 : 80 80
0b0110 : 80 2
0b0111 : 80 2 80
Other sequences:
0b0101 : 4 1a
0b0001 : c b1
Opening record
----------------
The text record that contains the opening node for the periodical (depth=0 node in the NCX) can have TBS of 3 different forms:
1. If it has only the periodical node and no section/article nodes, TBS of type 2, like this::
Record #1: Starts at: 0 Ends at: 4095
Contains: 1 index entries (0 ends, 0 complete, 1 starts)
TBS bytes: 82 80
Starts:
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 68470) [j_x's Google reader]
TBS Type: 010 (2)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
2. A periodical and a section node, but no article nodes, TBS type of 6, like this::
Record #1: Starts at: 0 Ends at: 4095
Contains: 2 index entries (0 ends, 0 complete, 2 starts)
TBS bytes: 86 80 2
Starts:
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 93254) [j_x's Google reader]
Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 541, Size: 49280) [Ars Technica]
TBS Type: 110 (6)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown (byte: always 2?): 2
3. If it has both the section 1 node and at least one article node, TBS of type 6, like this::
Record #1: Starts at: 0 Ends at: 4095
Contains: 4 index entries (0 ends, 1 complete, 3 starts)
TBS bytes: 86 80 2 c4 2
Complete:
Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 549, Size: 1866) [Week in gaming: 3DS review, Crysis 2, George Hotz]
Starts:
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 79253) [j_x's Google reader]
Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 541, Size: 35279) [Ars Technica]
Index Entry: 6 (Parent index: 1, Depth: 2, Offset: 2415, Size: 2764) [Week in Apple: ZFS on Mac OS X, rogue tethering, DUI apps, and more]
TBS Type: 110 (6)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown (byte: always 2?): 2
Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute]
Number of article nodes in the record (byte): 2
If there was only a single article, instead of 2, then the last two bytes would be: c0, i.e. there would be no byte giving the number of articles in the record.
Starting record with two section transitions::
Record #1: Starts at: 0 Ends at: 4095
Contains: 7 index entries (0 ends, 4 complete, 3 starts)
TBS bytes: 86 80 2 c0 b8 c4 3
Complete:
Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 564, Size: 375) [Ars Technica]
Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 572, Size: 367) [Week in gaming: 3DS review, Crysis 2, George Hotz]
Index Entry: 6 (Parent index: 2, Depth: 2, Offset: 947, Size: 1014) [Max and the Magic Marker for iPad: Review]
Index Entry: 7 (Parent index: 2, Depth: 2, Offset: 1961, Size: 1077) [iPad 2 steers itself into home console gaming territory with Real Racing 2 HD]
Starts:
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 35372) [j_x's Google reader]
Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 939, Size: 10368) [Neowin.net]
Index Entry: 8 (Parent index: 2, Depth: 2, Offset: 3038, Size: 1082) [Microsoft's Joe Belfiore still working on upcoming Zune hardware]
TBS Type: 110 (6)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown (byte: always 2?): 2
Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute]
Remaining bytes: b8 c4 3
Starting record with three section transitions::
Record #1: Starts at: 0 Ends at: 4095
Contains: 10 index entries (0 ends, 7 complete, 3 starts)
TBS bytes: 86 80 2 c0 b8 c0 b8 c4 4
Complete:
Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 564, Size: 375) [Ars Technica]
Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 939, Size: 316) [Neowin.net]
Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 572, Size: 367) [Week in gaming: 3DS review, Crysis 2, George Hotz]
Index Entry: 6 (Parent index: 2, Depth: 2, Offset: 947, Size: 308) [Max and the Magic Marker for iPad: Review]
Index Entry: 7 (Parent index: 3, Depth: 2, Offset: 1263, Size: 760) [OSnews Asks on Interrupts: The Results]
Index Entry: 8 (Parent index: 3, Depth: 2, Offset: 2023, Size: 693) [Apple Ditches SAMBA in Favour of Homegrown Replacement]
Index Entry: 9 (Parent index: 3, Depth: 2, Offset: 2716, Size: 747) [ITC: Apple's Mobile Products Do Not Violate Nokia Patents]
Starts:
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 25320) [j_x's Google reader]
Index Entry: 3 (Parent index: 0, Depth: 1, Offset: 1255, Size: 6829) [OSNews]
Index Entry: 10 (Parent index: 3, Depth: 2, Offset: 3463, Size: 666) [Transparent Monitor Embedded in Window Glass]
TBS Type: 110 (6)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown (byte: always 2?): 2
Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute]
Remaining bytes: b8 c0 b8 c4 4
Records with no nodes
------------------------
subtype = 010
These records are spanned by a single article. They are of two types:
1. If the parent section index is 1, TBS type of 6, like this::
Record #4: Starts at: 12288 Ends at: 16383
Contains: 0 index entries (0 ends, 0 complete, 0 starts)
TBS bytes: 86 80 2 c1 80
TBS Type: 110 (6)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown (byte: always 2?): 2
Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute]
EOF (vwi: should be 0): 0
If the record is before the first article, the TBS bytes would be: 86 80 2
2. If the parent section index is > 1, TBS type of 2, like this::
Record #14: Starts at: 53248 Ends at: 57343
Contains: 0 index entries (0 ends, 0 complete, 0 starts)
TBS bytes: 82 80 a0 1 e1 80
TBS Type: 010 (2)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Parent section index (fvwi): 2
Flags: 0
Article index at start of record or first article index, relative to parent section (fvwi): 14 [16 absolute]
EOF (vwi: should be 0): 0
Records with only article nodes
-----------------------------------
Such records have no section transitions (i.e. a section end/section start pair). They have only one or more article nodes. They are of two types:
1. If the parent section index is 1, TBS type of 7, like this::
Record #6: Starts at: 20480 Ends at: 24575
Contains: 2 index entries (1 ends, 0 complete, 1 starts)
TBS bytes: 87 80 2 80 1 84 2
Ends:
Index Entry: 9 (Parent index: 1, Depth: 2, Offset: 16453, Size: 4199) [Vaccine's success spurs whooping cough comeback]
Starts:
Index Entry: 10 (Parent index: 1, Depth: 2, Offset: 20652, Size: 4246) [Apple's mobile products do not violate Nokia patents, says ITC]
TBS Type: 111 (7)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown: '\x02\x80' (vwi?: Always 256)
Article at start of record (fvwi): 8
Number of articles in record (byte): 2
If there was only one article in the record, the last two bytes would be replaced by a single byte: 80
If this record is the first record with an article, then the article at the start of the record should be the last section index. At least, that's what kindlegen does, though if you ask me, it should be the first section index.
2. If the parent section index is > 1, TBS type of 2, like this::
Record #16: Starts at: 61440 Ends at: 65535
Contains: 5 index entries (1 ends, 3 complete, 1 starts)
TBS bytes: 82 80 a1 80 1 f4 5
Ends:
Index Entry: 17 (Parent index: 2, Depth: 2, Offset: 60920, Size: 1082) [Microsoft's Joe Belfiore still working on upcoming Zune hardware]
Complete:
Index Entry: 18 (Parent index: 2, Depth: 2, Offset: 62002, Size: 1016) [Rumour: OS X Lion nearing Golden Master stage]
Index Entry: 19 (Parent index: 2, Depth: 2, Offset: 63018, Size: 1045) [iOS 4.3.1 released]
Index Entry: 20 (Parent index: 2, Depth: 2, Offset: 64063, Size: 972) [Windows 8 'system reset' image leaks]
Starts:
Index Entry: 21 (Parent index: 2, Depth: 2, Offset: 65035, Size: 1057) [Windows Phone 7: Why it's failing]
TBS Type: 010 (2)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Parent section index (fvwi) : 2
Flags: 1
Unknown (vwi: always 0?): 0
Article index at start of record or first article index, relative to parent section (fvwi): 15 [17 absolute]
Number of article nodes in the record (byte): 5
If there was only one article in the record, the last two bytes would be replaced by a single byte: f0
Records with a section transition
-----------------------------------
In such a record there is a transition from one section to the next. As such the record must have at least one article ending and one article starting, except in the case of the first section.
1. The first section::
Record #2: Starts at: 4096 Ends at: 8191
Contains: 2 index entries (0 ends, 0 complete, 2 starts)
TBS bytes: 83 80 80 90 c0
Starts:
Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 7758, Size: 26279) [Ars Technica]
Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 7766, Size: 1866) [Week in gaming: 3DS review, Crysis 2, George Hotz]
TBS Type: 011 (3)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown (vwi: always 0?): 0
First section index (fvwi) : 1
Extra bits: 0
First section starts
Article at start of block as offset from parent index (fvwi): 4 [5 absolute]
Flags: 0
If there was more than one article at the start then the last byte would be replaced by: c4 n where n is the number of articles
2. A record with a section transition and only one article from the ending section::
Record #9: Starts at: 32768 Ends at: 36863
Contains: 6 index entries (2 ends, 2 complete, 2 starts)
TBS bytes: 83 80 80 90 1 d0 1 c8 1 d4 3
Ends:
Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 7758, Size: 26279) [Ars Technica]
Index Entry: 14 (Parent index: 1, Depth: 2, Offset: 31929, Size: 2108) [Trademarked keyword sales may soon be restricted in Europe]
Complete:
Index Entry: 15 (Parent index: 2, Depth: 2, Offset: 34045, Size: 1014) [Max and the Magic Marker for iPad: Review]
Index Entry: 16 (Parent index: 2, Depth: 2, Offset: 35059, Size: 1077) [iPad 2 steers itself into home console gaming territory with Real Racing 2 HD]
Starts:
Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 34037, Size: 10368) [Neowin.net]
Index Entry: 17 (Parent index: 2, Depth: 2, Offset: 36136, Size: 1082) [Microsoft's Joe Belfiore still working on upcoming Zune hardware]
TBS Type: 011 (3)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown (vwi: always 0?): 0
First section index (fvwi): 1
Extra bits (flag: always 0?): 0
First article of ending section, relative to its parent's index (fvwi): 13 [14 absolute]
Last article of ending section w.r.t. starting section offset (fvwi): 12 [14 absolute]
Flags (always 8?): 8
Article index at start of record or first article index, relative to parent section (fvwi): 13 [15 absolute]
Number of article nodes in the record (byte): 3
3. A record with a section transition and more than one article from the ending section::
Record #11: Starts at: 40960 Ends at: 45055
Contains: 7 index entries (2 ends, 3 complete, 2 starts)
TBS bytes: 83 80 80 a0 2 b5 4 1a f5 2 d8 2 e0
Ends:
Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 34037, Size: 10368) [Neowin.net]
Index Entry: 21 (Parent index: 2, Depth: 2, Offset: 40251, Size: 1057) [Windows Phone 7: Why it's failing]
Complete:
Index Entry: 22 (Parent index: 2, Depth: 2, Offset: 41308, Size: 1050) [RIM announces Android app support for Blackberry Playbook]
Index Entry: 23 (Parent index: 2, Depth: 2, Offset: 42358, Size: 1087) [Microsoft buys $7.5m worth of IPv4 addresses]
Index Entry: 24 (Parent index: 2, Depth: 2, Offset: 43445, Size: 960) [TechSpot: Apple iPad 2 Review]
Starts:
Index Entry: 3 (Parent index: 0, Depth: 1, Offset: 44405, Size: 6829) [OSNews]
Index Entry: 25 (Parent index: 3, Depth: 2, Offset: 44413, Size: 760) [OSnews Asks on Interrupts: The Results]
TBS Type: 011 (3)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown (vwi: always 0?): 0
First section index (fvwi): 2
Extra bits (flag: always 0?): 0
First article of ending section, relative to its parent's index (fvwi): 19 [21 absolute]
Number of article nodes in the record (byte): 4
->Offset from start of record to beginning of last starting section in this record (vwi)): 3445
Last article of ending section w.r.t. starting section offset (fvwi): 21 [24 absolute]
Flags (always 8?): 8
Article index at start of record or first article index, relative to parent section (fvwi): 22 [25 absolute]
The difference to the previous case is the extra two bytes that encode the offset of the opening section from the start of the record.
4. A record with multiple section transitions::
Record #9: Starts at: 32768 Ends at: 36863
Contains: 9 index entries (2 ends, 5 complete, 2 starts)
TBS bytes: 83 80 80 90 1 d0 1 c8 1 d1 c b1 1 c8 1 d4 4
Ends:
Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 7758, Size: 26279) [Ars Technica]
Index Entry: 14 (Parent index: 1, Depth: 2, Offset: 31929, Size: 2108) [Trademarked keyword sales may soon be restricted in Europe]
Complete:
Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 34037, Size: 316) [Neowin.net]
Index Entry: 15 (Parent index: 2, Depth: 2, Offset: 34045, Size: 308) [Max and the Magic Marker for iPad: Review]
Index Entry: 16 (Parent index: 3, Depth: 2, Offset: 34361, Size: 760) [OSnews Asks on Interrupts: The Results]
Index Entry: 17 (Parent index: 3, Depth: 2, Offset: 35121, Size: 693) [Apple Ditches SAMBA in Favour of Homegrown Replacement]
Index Entry: 18 (Parent index: 3, Depth: 2, Offset: 35814, Size: 747) [ITC: Apple's Mobile Products Do Not Violate Nokia Patents]
Starts:
Index Entry: 3 (Parent index: 0, Depth: 1, Offset: 34353, Size: 6829) [OSNews]
Index Entry: 19 (Parent index: 3, Depth: 2, Offset: 36561, Size: 666) [Transparent Monitor Embedded in Window Glass]
TBS Type: 011 (3)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown (vwi: always 0?): 0
First section index (fvwi): 1
Extra bits (flag: always 0?): 0
First article of ending section, relative to its parent's index (fvwi): 13 [14 absolute]
Last article of ending section w.r.t. starting section offset (fvwi): 12 [14 absolute]
Flags (always 8?): 8
Article index at start of record or first article index, relative to parent section (fvwi): 13 [15 absolute]
->Offset from start of record to beginning ofnext starting section in this record: 1585
Last article of ending section w.r.t. starting section offset (fvwi): 12 [15 absolute]
Flags (always 8?): 8
Article index at start of record or first article index, relative to parent section (fvwi): 13 [16 absolute]
Number of article nodes in the record belonging ot the last section (byte): 4
Ending record
----------------
Logically, ending records must have at least one article ending, one section ending and the periodical ending. They are of TBS type 2, like this::
Record #17: Starts at: 65536 Ends at: 68684
Contains: 4 index entries (3 ends, 1 complete, 0 starts)
TBS bytes: 82 80 c0 4 f4 2
Ends:
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 68470) [j_x's Google reader]
Index Entry: 4 (Parent index: 0, Depth: 1, Offset: 51234, Size: 17451) [Slashdot]
Index Entry: 43 (Parent index: 4, Depth: 2, Offset: 65422, Size: 1717) [US ITC May Reverse Judge&#39;s Ruling In Kodak vs. Apple]
Complete:
Index Entry: 44 (Parent index: 4, Depth: 2, Offset: 67139, Size: 1546) [Google Starts Testing Google Music Internally]
TBS Type: 010 (2)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Parent section index (fvwi): 4
Flags: 0
Article at start of block as offset from parent index (fvwi): 39 [43 absolute]
Number of nodes (byte): 2
If the record had only a single article end, the last two bytes would be replaced with: f0
If the last record has multiple section transitions, it is of type 6 and looks like::
Record #9: Starts at: 32768 Ends at: 34953
Contains: 9 index entries (3 ends, 6 complete, 0 starts)
TBS bytes: 86 80 2 1 d0 1 c8 1 d0 1 c8 1 d0 1 c8 1 d0
Ends:
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 34739) [j_x's Google reader]
Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 7758, Size: 26279) [Ars Technica]
Index Entry: 14 (Parent index: 1, Depth: 2, Offset: 31929, Size: 2108) [Trademarked keyword sales may soon be restricted in Europe]
Complete:
Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 34037, Size: 316) [Neowin.net]
Index Entry: 3 (Parent index: 0, Depth: 1, Offset: 34353, Size: 282) [OSNews]
Index Entry: 4 (Parent index: 0, Depth: 1, Offset: 34635, Size: 319) [Slashdot]
Index Entry: 15 (Parent index: 2, Depth: 2, Offset: 34045, Size: 308) [Max and the Magic Marker for iPad: Review]
Index Entry: 16 (Parent index: 3, Depth: 2, Offset: 34361, Size: 274) [OSnews Asks on Interrupts: The Results]
Index Entry: 17 (Parent index: 4, Depth: 2, Offset: 34643, Size: 311) [Leonard Nimoy Turns 80]
TBS Type: 110 (6)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown (byte: always 2?): 2
Article index at start of record or first article index, relative to parent section (fvwi): 13 [14 absolute]
Remaining bytes: 1 c8 1 d0 1 c8 1 d0 1 c8 1 d0

View File

@ -0,0 +1,341 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct
from collections import OrderedDict
from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
from calibre.ebooks import normalize
IMAGE_MAX_SIZE = 10 * 1024 * 1024
def decode_hex_number(raw):
'''
Return a variable length number encoded using hexadecimal encoding. These
numbers have the first byte which tells the number of bytes that follow.
The bytes that follow are simply the hexadecimal representation of the
number.
:param raw: Raw binary data as a bytestring
:return: The number and the number of bytes from raw that the number
occupies
'''
length, = struct.unpack(b'>B', raw[0])
raw = raw[1:1+length]
consumed = length+1
return int(raw, 16), consumed
def encode_number_as_hex(num):
'''
Encode num as a variable length encoded hexadecimal number. Returns the
bytestring containing the encoded number. These
numbers have the first byte which tells the number of bytes that follow.
The bytes that follow are simply the hexadecimal representation of the
number.
'''
num = bytes(hex(num)[2:].upper())
nlen = len(num)
if nlen % 2 != 0:
num = b'0'+num
ans = bytearray(num)
ans.insert(0, len(num))
return bytes(ans)
def encint(value, forward=True):
'''
Some parts of the Mobipocket format encode data as variable-width integers.
These integers are represented big-endian with 7 bits per byte in bits 1-7.
They may be either forward-encoded, in which case only the first byte has bit 8 set,
or backward-encoded, in which case only the last byte has bit 8 set.
For example, the number 0x11111 = 0b10001000100010001 would be represented
forward-encoded as:
0x04 0x22 0x91 = 0b100 0b100010 0b10010001
And backward-encoded as:
0x84 0x22 0x11 = 0b10000100 0b100010 0b10001
This function encodes the integer ``value`` as a variable width integer and
returns the bytestring corresponding to it.
If forward is True the bytes returned are suitable for prepending to the
output buffer, otherwise they must be append to the output buffer.
'''
if value < 0:
raise ValueError('Cannot encode negative numbers as vwi')
# Encode vwi
byts = bytearray()
while True:
b = value & 0b01111111
value >>= 7 # shift value to the right by 7 bits
byts.append(b)
if value == 0:
break
byts[0 if forward else -1] |= 0b10000000
byts.reverse()
return bytes(byts)
def decint(raw, forward=True):
'''
Read a variable width integer from the bytestring or bytearray raw and return the
integer and the number of bytes read. If forward is True bytes are read
from the start of raw, otherwise from the end of raw.
This function is the inverse of encint above, see its docs for more
details.
'''
val = 0
byts = bytearray()
src = bytearray(raw)
if not forward:
src.reverse()
for bnum in src:
byts.append(bnum & 0b01111111)
if bnum & 0b10000000:
break
if not forward:
byts.reverse()
for byte in byts:
val <<= 7 # Shift value to the left by 7 bits
val |= byte
return val, len(byts)
def test_decint(num):
for d in (True, False):
raw = encint(num, forward=d)
sz = len(raw)
if (num, sz) != decint(raw, forward=d):
raise ValueError('Failed for num %d, forward=%r: %r != %r' % (
num, d, (num, sz), decint(raw, forward=d)))
def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None):
'''
Convert image setting all transparent pixels to white and changing format
to JPEG. Ensure the resultant image has a byte size less than
maxsizeb.
If dimen is not None, generate a thumbnail of width=dimen, height=dimen
Returns the image as a bytestring
'''
if dimen is not None:
data = thumbnail(data, width=dimen, height=dimen,
compression_quality=90)[-1]
else:
# Replace transparent pixels with white pixels and convert to JPEG
data = save_cover_data_to(data, 'img.jpg', return_data=True)
if len(data) <= maxsizeb:
return data
orig_data = data
img = Image()
quality = 95
img.load(data)
while len(data) >= maxsizeb and quality >= 10:
quality -= 5
img.set_compression_quality(quality)
data = img.export('jpg')
if len(data) <= maxsizeb:
return data
orig_data = data
scale = 0.9
while len(data) >= maxsizeb and scale >= 0.05:
img = Image()
img.load(orig_data)
w, h = img.size
img.size = (int(scale*w), int(scale*h))
img.set_compression_quality(quality)
data = img.export('jpg')
scale -= 0.05
return data
def get_trailing_data(record, extra_data_flags):
'''
Given a text record as a bytestring and the extra data flags from the MOBI
header, return the trailing data as a dictionary, mapping bit number to
data as bytestring. Also returns the record - all trailing data.
:return: Trailing data, record - trailing data
'''
data = OrderedDict()
flags = extra_data_flags >> 1
num = 0
while flags:
num += 1
if flags & 0b1:
sz, consumed = decint(record, forward=False)
if sz > consumed:
data[num] = record[-sz:-consumed]
record = record[:-sz]
flags >>= 1
# Read multibyte chars if any
if extra_data_flags & 0b1:
# Only the first two bits are used for the size since there can
# never be more than 3 trailing multibyte chars
sz = (ord(record[-1]) & 0b11) + 1
consumed = 1
if sz > consumed:
data[0] = record[-sz:-consumed]
record = record[:-sz]
return data, record
def encode_trailing_data(raw):
'''
Given some data in the bytestring raw, return a bytestring of the form
<data><size>
where size is a backwards encoded vwi whose value is the length of the
entire returned bytestring. data is the bytestring passed in as raw.
This is the encoding used for trailing data entries at the end of text
records. See get_trailing_data() for details.
'''
lsize = 1
while True:
encoded = encint(len(raw) + lsize, forward=False)
if len(encoded) == lsize:
break
lsize += 1
return raw + encoded
def encode_fvwi(val, flags, flag_size=4):
'''
Encode the value val and the flag_size bits from flags as a fvwi. This encoding is
used in the trailing byte sequences for indexing. Returns encoded
bytestring.
'''
ans = val << flag_size
for i in xrange(flag_size):
ans |= (flags & (1 << i))
return encint(ans)
def decode_fvwi(byts, flag_size=4):
'''
Decode encoded fvwi. Returns number, flags, consumed
'''
arg, consumed = decint(bytes(byts))
val = arg >> flag_size
flags = 0
for i in xrange(flag_size):
flags |= (arg & (1 << i))
return val, flags, consumed
def decode_tbs(byts, flag_size=4):
'''
Trailing byte sequences for indexing consists of series of fvwi numbers.
This function reads the fvwi number and its associated flags. It them uses
the flags to read any more numbers that belong to the series. The flags are
the lowest 4 bits of the vwi (see the encode_fvwi function above).
Returns the fvwi number, a dictionary mapping flags bits to the associated
data and the number of bytes consumed.
'''
byts = bytes(byts)
val, flags, consumed = decode_fvwi(byts, flag_size=flag_size)
extra = {}
byts = byts[consumed:]
if flags & 0b1000 and flag_size > 3:
extra[0b1000] = True
if flags & 0b0010:
x, consumed2 = decint(byts)
byts = byts[consumed2:]
extra[0b0010] = x
consumed += consumed2
if flags & 0b0100:
extra[0b0100] = ord(byts[0])
byts = byts[1:]
consumed += 1
if flags & 0b0001:
x, consumed2 = decint(byts)
byts = byts[consumed2:]
extra[0b0001] = x
consumed += consumed2
return val, extra, consumed
def encode_tbs(val, extra, flag_size=4):
'''
Encode the number val and the extra data in the extra dict as an fvwi. See
decode_tbs above.
'''
flags = 0
for flag in extra:
flags |= flag
ans = encode_fvwi(val, flags, flag_size=flag_size)
if 0b0010 in extra:
ans += encint(extra[0b0010])
if 0b0100 in extra:
ans += bytes(bytearray([extra[0b0100]]))
if 0b0001 in extra:
ans += encint(extra[0b0001])
return ans
def utf8_text(text):
'''
Convert a possibly null string to utf-8 bytes, guaranteeing to return a non
empty, normalized bytestring.
'''
if text and text.strip():
text = text.strip()
if not isinstance(text, unicode):
text = text.decode('utf-8', 'replace')
text = normalize(text).encode('utf-8')
else:
text = _('Unknown').encode('utf-8')
return text
def align_block(raw, multiple=4, pad=b'\0'):
'''
Return raw with enough pad bytes append to ensure its length is a multiple
of 4.
'''
extra = len(raw) % multiple
if extra == 0: return raw
return raw + pad*(multiple - extra)
def detect_periodical(toc, log=None):
'''
Detect if the TOC object toc contains a periodical that conforms to the
structure required by kindlegen to generate a periodical.
'''
for node in toc.iterdescendants():
if node.depth() == 1 and node.klass != 'article':
if log is not None:
log.debug(
'Not a periodical: Deepest node does not have '
'class="article"')
return False
if node.depth() == 2 and node.klass != 'section':
if log is not None:
log.debug(
'Not a periodical: Second deepest node does not have'
' class="section"')
return False
if node.depth() == 3 and node.klass != 'periodical':
if log is not None:
log.debug('Not a periodical: Third deepest node'
' does not have class="periodical"')
return False
if node.depth() > 3:
if log is not None:
log.debug('Not a periodical: Has nodes of depth > 3')
return False
return True

View File

@ -111,7 +111,8 @@ def align_block(raw, multiple=4, pad='\0'):
def rescale_image(data, maxsizeb, dimen=None):
if dimen is not None:
data = thumbnail(data, width=dimen, height=dimen)[-1]
data = thumbnail(data, width=dimen[0], height=dimen[1],
compression_quality=90)[-1]
else:
# Replace transparent pixels with white pixels and convert to JPEG
data = save_cover_data_to(data, 'img.jpg', return_data=True)
@ -141,7 +142,7 @@ def rescale_image(data, maxsizeb, dimen=None):
scale -= 0.05
return data
class Serializer(object):
class Serializer(object): # {{{
NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'}
def __init__(self, oeb, images, write_page_breaks_after_item=True):
@ -172,6 +173,9 @@ class Serializer(object):
hrefs = self.oeb.manifest.hrefs
buffer.write('<guide>')
for ref in self.oeb.guide.values():
# The Kindle decides where to open a book based on the presence of
# an item in the guide that looks like
# <reference type="text" title="Start" href="chapter-one.xhtml"/>
path = urldefrag(ref.href)[0]
if path not in hrefs or hrefs[path].media_type not in OEB_DOCS:
continue
@ -215,12 +219,6 @@ class Serializer(object):
self.anchor_offset = buffer.tell()
buffer.write('<body>')
self.anchor_offset_kindle = buffer.tell()
# CybookG3 'Start Reading' link
if 'text' in self.oeb.guide:
href = self.oeb.guide['text'].href
buffer.write('<a ')
self.serialize_href(href)
buffer.write(' />')
spine = [item for item in self.oeb.spine if item.linear]
spine.extend([item for item in self.oeb.spine if not item.linear])
for item in spine:
@ -315,16 +313,20 @@ class Serializer(object):
buffer.seek(hoff)
buffer.write('%010d' % ioff)
# }}}
class MobiWriter(object):
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
def __init__(self, opts, compression=PALMDOC, imagemax=None,
prefer_author_sort=False, write_page_breaks_after_item=True):
def __init__(self, opts,
write_page_breaks_after_item=True):
self.opts = opts
self.write_page_breaks_after_item = write_page_breaks_after_item
self._compression = compression or UNCOMPRESSED
self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE
self._prefer_author_sort = prefer_author_sort
self._compression = UNCOMPRESSED if getattr(opts, 'dont_compress',
False) else PALMDOC
self._imagemax = (PALM_MAX_IMAGE_SIZE if getattr(opts,
'rescale_images', False) else OTHER_MAX_IMAGE_SIZE)
self._prefer_author_sort = getattr(opts, 'prefer_author_sort', False)
self._primary_index_record = None
self._conforming_periodical_toc = False
self._indexable = False
@ -428,6 +430,7 @@ class MobiWriter(object):
text.seek(npos)
return data, overlap
# TBS {{{
def _generate_flat_indexed_navpoints(self):
# Assemble a HTMLRecordData instance for each HTML record
# Return True if valid, False if invalid
@ -1172,6 +1175,8 @@ class MobiWriter(object):
self._tbSequence = tbSequence
# }}}
def _evaluate_periodical_toc(self):
'''
Periodical:
@ -1229,6 +1234,9 @@ class MobiWriter(object):
self._oeb.logger.info(' Compressing markup content...')
data, overlap = self._read_text_record(text)
if not self.opts.mobi_periodical:
self._flatten_toc()
# Evaluate toc for conformance
if self.opts.mobi_periodical :
self._oeb.logger.info(' MOBI periodical specified, evaluating TOC for periodical conformance ...')
@ -1258,11 +1266,11 @@ class MobiWriter(object):
data = compress_doc(data)
record = StringIO()
record.write(data)
# Marshall's utf-8 break code.
if WRITE_PBREAKS :
# Write trailing muti-byte sequence if any
record.write(overlap)
record.write(pack('>B', len(overlap)))
if WRITE_PBREAKS :
nextra = 0
pbreak = 0
running = offset
@ -1325,6 +1333,8 @@ class MobiWriter(object):
except:
self._oeb.logger.warn('Bad image file %r' % item.href)
continue
finally:
item.unload_data_from_memory()
self._records.append(data)
if self._first_image_record is None:
self._first_image_record = len(self._records)-1
@ -1627,7 +1637,7 @@ class MobiWriter(object):
now = int(time.time())
nrecords = len(self._records)
self._write(title, pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0),
'BOOK', 'MOBI', pack('>IIH', nrecords, 0, nrecords))
'BOOK', 'MOBI', pack('>IIH', (2*nrecords)-1, 0, nrecords))
offset = self._tell() + (8 * nrecords) + 2
for i, record in enumerate(self._records):
self._write(pack('>I', offset), '\0', pack('>I', 2*i)[1:])
@ -1638,6 +1648,87 @@ class MobiWriter(object):
for record in self._records:
self._write(record)
def _clean_text_value(self, text):
if text is not None and text.strip() :
text = text.strip()
if not isinstance(text, unicode):
text = text.decode('utf-8', 'replace')
text = normalize(text).encode('utf-8')
else :
text = "(none)".encode('utf-8')
return text
def _compute_offset_length(self, i, node, entries) :
h = node.href
if h not in self._id_offsets:
self._oeb.log.warning('Could not find TOC entry:', node.title)
return -1, -1
offset = self._id_offsets[h]
length = None
# Calculate length based on next entry's offset
for sibling in entries[i+1:]:
h2 = sibling.href
if h2 in self._id_offsets:
offset2 = self._id_offsets[h2]
if offset2 > offset:
length = offset2 - offset
break
if length is None:
length = self._content_length - offset
return offset, length
def _establish_document_structure(self) :
documentType = None
try :
klass = self._ctoc_map[0]['klass']
except :
klass = None
if klass == 'chapter' or klass == None :
documentType = 'book'
if self.opts.verbose > 2 :
self._oeb.logger.info("Adding a MobiBook to self._MobiDoc")
self._MobiDoc.documentStructure = MobiBook()
elif klass == 'periodical' :
documentType = klass
if self.opts.verbose > 2 :
self._oeb.logger.info("Adding a MobiPeriodical to self._MobiDoc")
self._MobiDoc.documentStructure = MobiPeriodical(self._MobiDoc.getNextNode())
self._MobiDoc.documentStructure.startAddress = self._anchor_offset_kindle
else :
raise NotImplementedError('_establish_document_structure: unrecognized klass: %s' % klass)
return documentType
# Index {{{
def _flatten_toc(self):
'''
Flatten and re-order entries in TOC so that chapter to chapter jumping
never fails on the Kindle.
'''
from calibre.ebooks.oeb.base import TOC
items = list(self._oeb.toc.iterdescendants())
if self.opts.mobi_navpoints_only_deepest:
items = [i for i in items if i.depth == 1]
offsets = {i:self._id_offsets.get(i.href, -1) for i in items if i.href}
items = [i for i in items if offsets[i] > -1]
items.sort(key=lambda i:offsets[i])
filt = []
seen = set()
for i in items:
off = offsets[i]
if off in seen: continue
seen.add(off)
filt.append(i)
items = filt
newtoc = TOC()
for c, i in enumerate(items):
newtoc.add(i.title, i.href, play_order=c+1, id=str(c),
klass='chapter')
self._oeb.toc = newtoc
def _generate_index(self):
self._oeb.log('Generating INDX ...')
self._primary_index_record = None
@ -1811,276 +1902,7 @@ class MobiWriter(object):
open(os.path.join(t, n+'.bin'), 'wb').write(self._records[-(i+1)])
self._oeb.log.debug('Index records dumped to', t)
def _clean_text_value(self, text):
if text is not None and text.strip() :
text = text.strip()
if not isinstance(text, unicode):
text = text.decode('utf-8', 'replace')
text = normalize(text).encode('utf-8')
else :
text = "(none)".encode('utf-8')
return text
def _add_to_ctoc(self, ctoc_str, record_offset):
# Write vwilen + string to ctoc
# Return offset
# Is there enough room for this string in the current ctoc record?
if 0xfbf8 - self._ctoc.tell() < 2 + len(ctoc_str):
# flush this ctoc, start a new one
# print "closing ctoc_record at 0x%X" % self._ctoc.tell()
# print "starting new ctoc with '%-50.50s ...'" % ctoc_str
# pad with 00
pad = 0xfbf8 - self._ctoc.tell()
# print "padding %d bytes of 00" % pad
self._ctoc.write('\0' * (pad))
self._ctoc_records.append(self._ctoc.getvalue())
self._ctoc.truncate(0)
self._ctoc_offset += 0x10000
record_offset = self._ctoc_offset
offset = self._ctoc.tell() + record_offset
self._ctoc.write(decint(len(ctoc_str), DECINT_FORWARD) + ctoc_str)
return offset
def _add_flat_ctoc_node(self, node, ctoc, title=None):
# Process 'chapter' or 'article' nodes only, force either to 'chapter'
t = node.title if title is None else title
t = self._clean_text_value(t)
self._last_toc_entry = t
# Create an empty dictionary for this node
ctoc_name_map = {}
# article = chapter
if node.klass == 'article' :
ctoc_name_map['klass'] = 'chapter'
else :
ctoc_name_map['klass'] = node.klass
# Add title offset to name map
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
self._chapterCount += 1
# append this node's name_map to map
self._ctoc_map.append(ctoc_name_map)
return
def _add_structured_ctoc_node(self, node, ctoc, title=None):
# Process 'periodical', 'section' and 'article'
# Fetch the offset referencing the current ctoc_record
if node.klass is None :
return
t = node.title if title is None else title
t = self._clean_text_value(t)
self._last_toc_entry = t
# Create an empty dictionary for this node
ctoc_name_map = {}
# Add the klass of this node
ctoc_name_map['klass'] = node.klass
if node.klass == 'chapter':
# Add title offset to name map
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
self._chapterCount += 1
elif node.klass == 'periodical' :
# Add title offset
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
# Look for existing class entry 'periodical' in _ctoc_map
for entry in self._ctoc_map:
if entry['klass'] == 'periodical':
# Use the pre-existing instance
ctoc_name_map['classOffset'] = entry['classOffset']
break
else :
continue
else:
# class names should always be in CNCX 0 - no offset
ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0)
self._periodicalCount += 1
elif node.klass == 'section' :
# Add title offset
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
# Look for existing class entry 'section' in _ctoc_map
for entry in self._ctoc_map:
if entry['klass'] == 'section':
# Use the pre-existing instance
ctoc_name_map['classOffset'] = entry['classOffset']
break
else :
continue
else:
# class names should always be in CNCX 0 - no offset
ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0)
self._sectionCount += 1
elif node.klass == 'article' :
# Add title offset/title
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
# Look for existing class entry 'article' in _ctoc_map
for entry in self._ctoc_map:
if entry['klass'] == 'article':
ctoc_name_map['classOffset'] = entry['classOffset']
break
else :
continue
else:
# class names should always be in CNCX 0 - no offset
ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0)
# Add description offset/description
if node.description :
d = self._clean_text_value(node.description)
ctoc_name_map['descriptionOffset'] = self._add_to_ctoc(d, self._ctoc_offset)
else :
ctoc_name_map['descriptionOffset'] = None
# Add author offset/attribution
if node.author :
a = self._clean_text_value(node.author)
ctoc_name_map['authorOffset'] = self._add_to_ctoc(a, self._ctoc_offset)
else :
ctoc_name_map['authorOffset'] = None
self._articleCount += 1
else :
raise NotImplementedError( \
'writer._generate_ctoc.add_node: title: %s has unrecognized klass: %s, playOrder: %d' % \
(node.title, node.klass, node.play_order))
# append this node's name_map to map
self._ctoc_map.append(ctoc_name_map)
def _generate_ctoc(self):
# Generate the compiled TOC strings
# Each node has 1-4 CTOC entries:
# Periodical (0xDF)
# title, class
# Section (0xFF)
# title, class
# Article (0x3F)
# title, class, description, author
# Chapter (0x0F)
# title, class
# nb: Chapters don't actually have @class, so we synthesize it
# in reader._toc_from_navpoint
toc = self._oeb.toc
reduced_toc = []
self._ctoc_map = [] # per node dictionary of {class/title/desc/author} offsets
self._last_toc_entry = None
#ctoc = StringIO()
self._ctoc = StringIO()
# Track the individual node types
self._periodicalCount = 0
self._sectionCount = 0
self._articleCount = 0
self._chapterCount = 0
#first = True
if self._conforming_periodical_toc :
self._oeb.logger.info('Generating structured CTOC ...')
for (child) in toc.iter():
if self.opts.verbose > 2 :
self._oeb.logger.info(" %s" % child)
self._add_structured_ctoc_node(child, self._ctoc)
#first = False
else :
self._oeb.logger.info('Generating flat CTOC ...')
previousOffset = -1
currentOffset = 0
for (i, child) in enumerate(toc.iterdescendants()):
# Only add chapters or articles at depth==1
# no class defaults to 'chapter'
if child.klass is None : child.klass = 'chapter'
if (child.klass == 'article' or child.klass == 'chapter') and child.depth() == 1 :
if self.opts.verbose > 2 :
self._oeb.logger.info("adding (klass:%s depth:%d) %s to flat ctoc" % \
(child.klass, child.depth(), child) )
# Test to see if this child's offset is the same as the previous child's
# offset, skip it
h = child.href
if h is None:
self._oeb.logger.warn(' Ignoring TOC entry with no href:',
child.title)
continue
if h not in self._id_offsets:
self._oeb.logger.warn(' Ignoring missing TOC entry:',
unicode(child))
continue
currentOffset = self._id_offsets[h]
# print "_generate_ctoc: child offset: 0x%X" % currentOffset
if currentOffset != previousOffset :
self._add_flat_ctoc_node(child, self._ctoc)
reduced_toc.append(child)
previousOffset = currentOffset
else :
self._oeb.logger.warn(" Ignoring redundant href: %s in '%s'" % (h, child.title))
else :
if self.opts.verbose > 2 :
self._oeb.logger.info("skipping class: %s depth %d at position %d" % \
(child.klass, child.depth(),i))
# Update the TOC with our edited version
self._oeb.toc.nodes = reduced_toc
# Instantiate a MobiDocument(mobitype)
if (not self._periodicalCount and not self._sectionCount and not self._articleCount) or \
not self.opts.mobi_periodical :
mobiType = 0x002
elif self._periodicalCount:
pt = None
if self._oeb.metadata.publication_type:
x = unicode(self._oeb.metadata.publication_type[0]).split(':')
if len(x) > 1:
pt = x[1]
mobiType = {'newspaper':0x101}.get(pt, 0x103)
else :
raise NotImplementedError('_generate_ctoc: Unrecognized document structured')
self._MobiDoc = MobiDocument(mobiType)
if self.opts.verbose > 2 :
structType = 'book'
if mobiType > 0x100 :
structType = 'flat periodical' if mobiType == 0x102 else 'structured periodical'
self._oeb.logger.info("Instantiating a %s MobiDocument of type 0x%X" % (structType, mobiType ) )
if mobiType > 0x100 :
self._oeb.logger.info("periodicalCount: %d sectionCount: %d articleCount: %d"% \
(self._periodicalCount, self._sectionCount, self._articleCount) )
else :
self._oeb.logger.info("chapterCount: %d" % self._chapterCount)
# Apparently the CTOC must end with a null byte
self._ctoc.write('\0')
ctoc = self._ctoc.getvalue()
rec_count = len(self._ctoc_records)
self._oeb.logger.info(" CNCX utilization: %d %s %.0f%% full" % \
(rec_count + 1, 'records, last record' if rec_count else 'record,',
len(ctoc)/655) )
return align_block(ctoc)
# Index nodes {{{
def _write_periodical_node(self, indxt, indices, index, offset, length, count, firstSection, lastSection) :
pos = 0xc0 + indxt.tell()
indices.write(pack('>H', pos)) # Save the offset for IDXTIndices
@ -2172,48 +1994,8 @@ class MobiWriter(object):
indxt.write(decint(self._ctoc_map[index]['titleOffset'], DECINT_FORWARD)) # vwi title offset in CNCX
indxt.write(decint(0, DECINT_FORWARD)) # unknown byte
def _compute_offset_length(self, i, node, entries) :
h = node.href
if h not in self._id_offsets:
self._oeb.log.warning('Could not find TOC entry:', node.title)
return -1, -1
# }}}
offset = self._id_offsets[h]
length = None
# Calculate length based on next entry's offset
for sibling in entries[i+1:]:
h2 = sibling.href
if h2 in self._id_offsets:
offset2 = self._id_offsets[h2]
if offset2 > offset:
length = offset2 - offset
break
if length is None:
length = self._content_length - offset
return offset, length
def _establish_document_structure(self) :
documentType = None
try :
klass = self._ctoc_map[0]['klass']
except :
klass = None
if klass == 'chapter' or klass == None :
documentType = 'book'
if self.opts.verbose > 2 :
self._oeb.logger.info("Adding a MobiBook to self._MobiDoc")
self._MobiDoc.documentStructure = MobiBook()
elif klass == 'periodical' :
documentType = klass
if self.opts.verbose > 2 :
self._oeb.logger.info("Adding a MobiPeriodical to self._MobiDoc")
self._MobiDoc.documentStructure = MobiPeriodical(self._MobiDoc.getNextNode())
self._MobiDoc.documentStructure.startAddress = self._anchor_offset_kindle
else :
raise NotImplementedError('_establish_document_structure: unrecognized klass: %s' % klass)
return documentType
def _generate_section_indices(self, child, currentSection, myPeriodical, myDoc ) :
sectionTitles = list(child.iter())[1:]
@ -2491,6 +2273,270 @@ class MobiWriter(object):
last_name, c = self._add_periodical_structured_articles(myDoc, indxt, indices)
return align_block(indxt.getvalue()), c, align_block(indices.getvalue()), last_name
# }}}
# CTOC {{{
def _add_to_ctoc(self, ctoc_str, record_offset):
# Write vwilen + string to ctoc
# Return offset
# Is there enough room for this string in the current ctoc record?
if 0xfbf8 - self._ctoc.tell() < 2 + len(ctoc_str):
# flush this ctoc, start a new one
# print "closing ctoc_record at 0x%X" % self._ctoc.tell()
# print "starting new ctoc with '%-50.50s ...'" % ctoc_str
# pad with 00
pad = 0xfbf8 - self._ctoc.tell()
# print "padding %d bytes of 00" % pad
self._ctoc.write('\0' * (pad))
self._ctoc_records.append(self._ctoc.getvalue())
self._ctoc.truncate(0)
self._ctoc_offset += 0x10000
record_offset = self._ctoc_offset
offset = self._ctoc.tell() + record_offset
self._ctoc.write(decint(len(ctoc_str), DECINT_FORWARD) + ctoc_str)
return offset
def _add_flat_ctoc_node(self, node, ctoc, title=None):
# Process 'chapter' or 'article' nodes only, force either to 'chapter'
t = node.title if title is None else title
t = self._clean_text_value(t)
self._last_toc_entry = t
# Create an empty dictionary for this node
ctoc_name_map = {}
# article = chapter
if node.klass == 'article' :
ctoc_name_map['klass'] = 'chapter'
else :
ctoc_name_map['klass'] = node.klass
# Add title offset to name map
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
self._chapterCount += 1
# append this node's name_map to map
self._ctoc_map.append(ctoc_name_map)
return
def _add_structured_ctoc_node(self, node, ctoc, title=None):
# Process 'periodical', 'section' and 'article'
# Fetch the offset referencing the current ctoc_record
if node.klass is None :
return
t = node.title if title is None else title
t = self._clean_text_value(t)
self._last_toc_entry = t
# Create an empty dictionary for this node
ctoc_name_map = {}
# Add the klass of this node
ctoc_name_map['klass'] = node.klass
if node.klass == 'chapter':
# Add title offset to name map
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
self._chapterCount += 1
elif node.klass == 'periodical' :
# Add title offset
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
# Look for existing class entry 'periodical' in _ctoc_map
for entry in self._ctoc_map:
if entry['klass'] == 'periodical':
# Use the pre-existing instance
ctoc_name_map['classOffset'] = entry['classOffset']
break
else :
continue
else:
# class names should always be in CNCX 0 - no offset
ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0)
self._periodicalCount += 1
elif node.klass == 'section' :
# Add title offset
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
# Look for existing class entry 'section' in _ctoc_map
for entry in self._ctoc_map:
if entry['klass'] == 'section':
# Use the pre-existing instance
ctoc_name_map['classOffset'] = entry['classOffset']
break
else :
continue
else:
# class names should always be in CNCX 0 - no offset
ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0)
self._sectionCount += 1
elif node.klass == 'article' :
# Add title offset/title
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
# Look for existing class entry 'article' in _ctoc_map
for entry in self._ctoc_map:
if entry['klass'] == 'article':
ctoc_name_map['classOffset'] = entry['classOffset']
break
else :
continue
else:
# class names should always be in CNCX 0 - no offset
ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0)
# Add description offset/description
if node.description :
d = self._clean_text_value(node.description)
ctoc_name_map['descriptionOffset'] = self._add_to_ctoc(d, self._ctoc_offset)
else :
ctoc_name_map['descriptionOffset'] = None
# Add author offset/attribution
if node.author :
a = self._clean_text_value(node.author)
ctoc_name_map['authorOffset'] = self._add_to_ctoc(a, self._ctoc_offset)
else :
ctoc_name_map['authorOffset'] = None
self._articleCount += 1
else :
raise NotImplementedError( \
'writer._generate_ctoc.add_node: title: %s has unrecognized klass: %s, playOrder: %d' % \
(node.title, node.klass, node.play_order))
# append this node's name_map to map
self._ctoc_map.append(ctoc_name_map)
def _generate_ctoc(self):
# Generate the compiled TOC strings
# Each node has 1-4 CTOC entries:
# Periodical (0xDF)
# title, class
# Section (0xFF)
# title, class
# Article (0x3F)
# title, class, description, author
# Chapter (0x0F)
# title, class
# nb: Chapters don't actually have @class, so we synthesize it
# in reader._toc_from_navpoint
toc = self._oeb.toc
reduced_toc = []
self._ctoc_map = [] # per node dictionary of {class/title/desc/author} offsets
self._last_toc_entry = None
#ctoc = StringIO()
self._ctoc = StringIO()
# Track the individual node types
self._periodicalCount = 0
self._sectionCount = 0
self._articleCount = 0
self._chapterCount = 0
#first = True
if self._conforming_periodical_toc :
self._oeb.logger.info('Generating structured CTOC ...')
for (child) in toc.iter():
if self.opts.verbose > 2 :
self._oeb.logger.info(" %s" % child)
self._add_structured_ctoc_node(child, self._ctoc)
#first = False
else :
self._oeb.logger.info('Generating flat CTOC ...')
previousOffset = -1
currentOffset = 0
for (i, child) in enumerate(toc.iterdescendants()):
# Only add chapters or articles at depth==1
# no class defaults to 'chapter'
if child.klass is None : child.klass = 'chapter'
if (child.klass == 'article' or child.klass == 'chapter') and child.depth() == 1 :
if self.opts.verbose > 2 :
self._oeb.logger.info("adding (klass:%s depth:%d) %s to flat ctoc" % \
(child.klass, child.depth(), child) )
# Test to see if this child's offset is the same as the previous child's
# offset, skip it
h = child.href
if h is None:
self._oeb.logger.warn(' Ignoring TOC entry with no href:',
child.title)
continue
if h not in self._id_offsets:
self._oeb.logger.warn(' Ignoring missing TOC entry:',
unicode(child))
continue
currentOffset = self._id_offsets[h]
# print "_generate_ctoc: child offset: 0x%X" % currentOffset
if currentOffset != previousOffset :
self._add_flat_ctoc_node(child, self._ctoc)
reduced_toc.append(child)
previousOffset = currentOffset
else :
self._oeb.logger.warn(" Ignoring redundant href: %s in '%s'" % (h, child.title))
else :
if self.opts.verbose > 2 :
self._oeb.logger.info("skipping class: %s depth %d at position %d" % \
(child.klass, child.depth(),i))
# Update the TOC with our edited version
self._oeb.toc.nodes = reduced_toc
# Instantiate a MobiDocument(mobitype)
if (not self._periodicalCount and not self._sectionCount and not self._articleCount) or \
not self.opts.mobi_periodical :
mobiType = 0x002
elif self._periodicalCount:
pt = None
if self._oeb.metadata.publication_type:
x = unicode(self._oeb.metadata.publication_type[0]).split(':')
if len(x) > 1:
pt = x[1]
mobiType = {'newspaper':0x101}.get(pt, 0x103)
else :
raise NotImplementedError('_generate_ctoc: Unrecognized document structured')
self._MobiDoc = MobiDocument(mobiType)
if self.opts.verbose > 2 :
structType = 'book'
if mobiType > 0x100 :
structType = 'flat periodical' if mobiType == 0x102 else 'structured periodical'
self._oeb.logger.info("Instantiating a %s MobiDocument of type 0x%X" % (structType, mobiType ) )
if mobiType > 0x100 :
self._oeb.logger.info("periodicalCount: %d sectionCount: %d articleCount: %d"% \
(self._periodicalCount, self._sectionCount, self._articleCount) )
else :
self._oeb.logger.info("chapterCount: %d" % self._chapterCount)
# Apparently the CTOC must end with a null byte
self._ctoc.write('\0')
ctoc = self._ctoc.getvalue()
rec_count = len(self._ctoc_records)
self._oeb.logger.info(" CNCX utilization: %d %s %.0f%% full" % \
(rec_count + 1, 'records, last record' if rec_count else 'record,',
len(ctoc)/655) )
return align_block(ctoc)
# }}}
class HTMLRecordData(object):
""" A data structure containing indexing/navigation data for an HTML record """

View File

@ -0,0 +1,16 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
UNCOMPRESSED = 1
PALMDOC = 2
HUFFDIC = 17480
PALM_MAX_IMAGE_SIZE = 63 * 1024
RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))

View File

@ -0,0 +1,856 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
from future_builtins import filter, map
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import pack
from cStringIO import StringIO
from collections import OrderedDict, defaultdict
from calibre.ebooks.mobi.writer2 import RECORD_SIZE
from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
encode_tbs, align_block, utf8_text)
class CNCX(object): # {{{
'''
Create the CNCX records. These are records containing all the strings from
the NCX. Each record is of the form: <vwi string size><utf-8 encoded
string>
'''
MAX_STRING_LENGTH = 500
def __init__(self, toc, is_periodical):
self.strings = OrderedDict()
for item in toc.iterdescendants(breadth_first=True):
self.strings[item.title] = 0
if is_periodical:
self.strings[item.klass] = 0
aut, desc = item.author, item.description
self.strings[item.author] = self.strings[item.description] = 0
self.records = []
offset = 0
buf = StringIO()
for key in tuple(self.strings.iterkeys()):
utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
l = len(utf8)
sz_bytes = encint(l)
raw = sz_bytes + utf8
if 0xfbf8 - buf.tell() < 6 + len(raw):
# Records in PDB files cannot be larger than 0x10000, so we
# stop well before that.
pad = 0xfbf8 - self._ctoc.tell()
buf.write(b'\0' * pad)
self.records.append(buf.getvalue())
buf.truncate(0)
offset = len(self.records) * 0x10000
buf.write(raw)
self.strings[key] = offset
offset += len(raw)
self.records.append(align_block(buf.getvalue()))
def __getitem__(self, string):
return self.strings[string]
# }}}
class TAGX(object): # {{{
BITMASKS = {11:0b1}
BITMASKS.update({x:i+1 for i, x in enumerate([1, 2, 3, 4, 5, 21, 22, 23])})
BITMASKS.update({x:i+1 for i, x in enumerate([69, 70, 71, 72, 73])})
NUM_VALUES = defaultdict(lambda x:1)
NUM_VALUES[11] = 3
NUM_VALUES[0] = 0
def __init__(self):
self.byts = bytearray()
def add_tag(self, tag):
buf = self.byts
buf.append(tag)
buf.append(self.NUM_VALUES[tag])
# bitmask
buf.append((1 << (self.BITMASKS[tag])) if tag else 0)
# eof
buf.append(0 if tag else 1)
def header(self, control_byte_count):
header = b'TAGX'
# table length, control byte count
header += pack(b'>II', 12+len(self.byts), control_byte_count)
return header
@property
def periodical(self):
'''
TAGX block for the Primary index header of a periodical
'''
map(self.add_tag, (1, 2, 3, 4, 5, 21, 22, 23, 0, 69, 70, 71, 72, 73, 0))
return self.header(2) + bytes(self.byts)
@property
def secondary(self):
'''
TAGX block for the secondary index header of a periodical
'''
map(self.add_tag, (11, 0))
return self.header(1) + bytes(self.byts)
@property
def flat_book(self):
'''
TAGX block for the primary index header of a flat book
'''
map(self.add_tag, (1, 2, 3, 4, 0))
return self.header(1) + bytes(self.byts)
# }}}
# Index Entries {{{
class IndexEntry(object):
TAG_VALUES = {
'offset': 1,
'size': 2,
'label_offset': 3,
'depth': 4,
'class_offset': 5,
'secondary': 11,
'parent_index': 21,
'first_child_index': 22,
'last_child_index': 23,
'image_index': 69,
'desc_offset': 70,
'author_offset': 73,
}
RTAG_MAP = {v:k for k, v in TAG_VALUES.iteritems()}
def __init__(self, offset, label_offset):
self.offset, self.label_offset = offset, label_offset
self.depth, self.class_offset = 0, None
self.control_byte_count = 1
self.length = 0
self.index = 0
self.parent_index = None
self.first_child_index = None
self.last_child_index = None
self.image_index = None
self.author_offset = None
self.desc_offset = None
def __repr__(self):
return ('IndexEntry(offset=%r, depth=%r, length=%r, index=%r,'
' parent_index=%r)')%(self.offset, self.depth, self.length,
self.index, self.parent_index)
@dynamic_property
def size(self):
def fget(self): return self.length
def fset(self, val): self.length = val
return property(fget=fget, fset=fset, doc='Alias for length')
@property
def next_offset(self):
return self.offset + self.length
@property
def tag_nums(self):
for i in range(1, 5):
yield i
for attr in ('class_offset', 'parent_index', 'first_child_index',
'last_child_index'):
if getattr(self, attr) is not None:
yield self.TAG_VALUES[attr]
@property
def entry_type(self):
ans = 0
for tag in self.tag_nums:
ans |= (1 << (TAGX.BITMASKS[tag])) # 1 << x == 2**x
return ans
@property
def bytestring(self):
buf = StringIO()
if isinstance(self.index, int):
buf.write(encode_number_as_hex(self.index))
else:
raw = bytearray(self.index.encode('ascii'))
raw.insert(0, len(raw))
buf.write(bytes(raw))
et = self.entry_type
buf.write(bytes(bytearray([et])))
if self.control_byte_count == 2:
flags = 0
for attr in ('image_index', 'desc_offset', 'author_offset'):
val = getattr(self, attr)
if val is not None:
tag = self.RTAG_MAP[attr]
bm = TAGX.BITMASKS[tag]
flags |= bm
buf.write(bytes(bytearray([flags])))
for tag in self.tag_nums:
attr = self.RTAG_MAP[tag]
val = getattr(self, attr)
if isinstance(val, int):
val = [val]
for x in val:
buf.write(encint(x))
if self.control_byte_count == 2:
for attr in ('image_index', 'desc_offset', 'author_offset'):
val = getattr(self, attr)
if val is not None:
buf.write(encint(val))
ans = buf.getvalue()
return ans
class PeriodicalIndexEntry(IndexEntry):
def __init__(self, offset, label_offset, class_offset, depth):
IndexEntry.__init__(offset, label_offset)
self.depth = depth
self.class_offset = class_offset
self.control_byte_count = 2
class SecondaryIndexEntry(IndexEntry):
INDEX_MAP = {'author':73, 'caption':72, 'credit':71, 'description':70,
'mastheadImage':69}
def __init__(self, index):
IndexEntry.__init__(self, index, 0)
tag = self.INDEX_MAP[index]
# The values for this index entry
self.secondary = [len(self.INDEX_MAP) if tag == min(
self.INDEX_MAP.itervalues()) else 0, 0, tag]
@property
def tag_nums(self):
yield 11
@property
def entry_type(self):
return 1
@classmethod
def entries(cls):
rmap = {v:k for k,v in cls.INDEX_MAP.iteritems()}
for tag in sorted(rmap, reverse=True):
yield cls(rmap[tag])
# }}}
class TBS(object): # {{{
'''
Take the list of index nodes starting/ending on a record and calculate the
trailing byte sequence for the record.
'''
def __init__(self, data, is_periodical, first=False, section_map={},
after_first=False):
self.section_map = section_map
#import pprint
#pprint.pprint(data)
#print()
if is_periodical:
# The starting bytes.
# The value is zero which I think indicates the periodical
# index entry. The values for the various flags seem to be
# unused. If the 0b100 is present, it means that the record
# deals with section 1 (or is the final record with section
# transitions).
self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3)
self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0},
flag_size=3)
self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0},
flag_size=3)
self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001:
0}, flag_size=3)
if not data:
byts = b''
if after_first:
# This can happen if a record contains only text between
# the periodical start and the first section
byts = self.type_011
self.bytestring = byts
else:
depth_map = defaultdict(list)
for x in ('starts', 'ends', 'completes'):
for idx in data[x]:
depth_map[idx.depth].append(idx)
for l in depth_map.itervalues():
l.sort(key=lambda x:x.offset)
self.periodical_tbs(data, first, depth_map)
else:
if not data:
self.bytestring = b''
else:
self.book_tbs(data, first)
def periodical_tbs(self, data, first, depth_map):
buf = StringIO()
has_section_start = (depth_map[1] and
set(depth_map[1]).intersection(set(data['starts'])))
spanner = data['spans']
parent_section_index = -1
if depth_map[0]:
# We have a terminal record
# Find the first non periodical node
first_node = None
for nodes in (depth_map[1], depth_map[2]):
for node in nodes:
if (first_node is None or (node.offset, node.depth) <
(first_node.offset, first_node.depth)):
first_node = node
typ = (self.type_110 if has_section_start else self.type_010)
# parent_section_index is needed for the last record
if first_node is not None and first_node.depth > 0:
parent_section_index = (first_node.index if first_node.depth
== 1 else first_node.parent_index)
else:
parent_section_index = max(self.section_map.iterkeys())
else:
# Non terminal record
if spanner is not None:
# record is spanned by a single article
parent_section_index = spanner.parent_index
typ = (self.type_110 if parent_section_index == 1 else
self.type_010)
elif not depth_map[1]:
# has only article nodes, i.e. spanned by a section
parent_section_index = depth_map[2][0].parent_index
typ = (self.type_111 if parent_section_index == 1 else
self.type_010)
else:
# has section transitions
if depth_map[2]:
parent_section_index = depth_map[2][0].parent_index
else:
parent_section_index = depth_map[1][0].index
typ = self.type_011
buf.write(typ)
if typ not in (self.type_110, self.type_111) and parent_section_index > 0:
extra = {}
# Write starting section information
if spanner is None:
num_articles = len([a for a in depth_map[1] if a.parent_index
== parent_section_index])
if not depth_map[1]:
extra = {0b0001: 0}
if num_articles > 1:
extra = {0b0100: num_articles}
buf.write(encode_tbs(parent_section_index, extra))
if spanner is None:
articles = depth_map[2]
sections = set([self.section_map[a.parent_index] for a in
articles])
sections = sorted(sections, key=lambda x:x.offset)
section_map = {s:[a for a in articles if a.parent_index ==
s.index] for s in sections}
for i, section in enumerate(sections):
# All the articles in this record that belong to section
articles = section_map[section]
first_article = articles[0]
last_article = articles[-1]
num = len(articles)
try:
next_sec = sections[i+1]
except:
next_sec = None
extra = {}
if num > 1:
extra[0b0100] = num
if False and i == 0 and next_sec is not None:
# Write offset to next section from start of record
# I can't figure out exactly when Kindlegen decides to
# write this so I have disabled it for now.
extra[0b0001] = next_sec.offset - data['offset']
buf.write(encode_tbs(first_article.index-section.index, extra))
if next_sec is not None:
buf.write(encode_tbs(last_article.index-next_sec.index,
{0b1000: 0}))
else:
buf.write(encode_tbs(spanner.index - parent_section_index,
{0b0001: 0}))
self.bytestring = buf.getvalue()
def book_tbs(self, data, first):
self.bytestring = b''
# }}}
class Indexer(object): # {{{
def __init__(self, serializer, number_of_text_records,
size_of_last_text_record, masthead_offset, is_periodical,
opts, oeb):
self.serializer = serializer
self.number_of_text_records = number_of_text_records
self.text_size = (RECORD_SIZE * (self.number_of_text_records-1) +
size_of_last_text_record)
self.masthead_offset = masthead_offset
self.secondary_record_offset = None
self.oeb = oeb
self.log = oeb.log
self.opts = opts
self.is_periodical = is_periodical
if self.is_periodical and self.masthead_offset is None:
raise ValueError('Periodicals must have a masthead')
self.log('Generating MOBI index for a %s'%('periodical' if
self.is_periodical else 'book'))
self.is_flat_periodical = False
if self.is_periodical:
periodical_node = iter(oeb.toc).next()
sections = tuple(periodical_node)
self.is_flat_periodical = len(sections) == 1
self.records = []
if self.is_periodical:
# Ensure all articles have an author and description before
# creating the CNCX
for node in oeb.toc.iterdescendants():
if node.klass == 'article':
aut, desc = node.author, node.description
if not aut: aut = _('Unknown')
if not desc: desc = _('No details available')
node.author, node.description = aut, desc
self.cncx = CNCX(oeb.toc, self.is_periodical)
if self.is_periodical:
self.indices = self.create_periodical_index()
else:
self.indices = self.create_book_index()
self.records.append(self.create_index_record())
self.records.insert(0, self.create_header())
self.records.extend(self.cncx.records)
if is_periodical:
self.secondary_record_offset = len(self.records)
self.records.append(self.create_header(secondary=True))
self.records.append(self.create_index_record(secondary=True))
self.calculate_trailing_byte_sequences()
def create_index_record(self, secondary=False): # {{{
header_length = 192
buf = StringIO()
indices = list(SecondaryIndexEntry.entries) if secondary else self.indices
# Write index entries
offsets = []
for i in indices:
offsets.append(buf.tell())
buf.write(i.bytestring)
index_block = align_block(buf.getvalue())
# Write offsets to index entries as an IDXT block
idxt_block = b'IDXT'
buf.truncate(0)
for offset in offsets:
buf.write(pack(b'>H', header_length+offset))
idxt_block = align_block(idxt_block + buf.getvalue())
body = index_block + idxt_block
header = b'INDX'
buf.truncate(0)
buf.write(pack(b'>I', header_length))
buf.write(b'\0'*4) # Unknown
buf.write(pack(b'>I', 1)) # Header type? Or index record number?
buf.write(b'\0'*4) # Unknown
# IDXT block offset
buf.write(pack(b'>I', header_length + len(index_block)))
# Number of index entries
buf.write(pack(b'>I', len(offsets)))
# Unknown
buf.write(b'\xff'*8)
# Unknown
buf.write(b'\0'*156)
header += buf.getvalue()
ans = header + body
if len(ans) > 0x10000:
raise ValueError('Too many entries (%d) in the TOC'%len(offsets))
return ans
# }}}
def create_header(self, secondary=False): # {{{
buf = StringIO()
if secondary:
tagx_block = TAGX().secondary
else:
tagx_block = (TAGX().periodical if self.is_periodical else
TAGX().flat_book)
header_length = 192
# Ident 0 - 4
buf.write(b'INDX')
# Header length 4 - 8
buf.write(pack(b'>I', header_length))
# Unknown 8-16
buf.write(b'\0'*8)
# Index type: 0 - normal, 2 - inflection 16 - 20
buf.write(pack(b'>I', 2))
# IDXT offset 20-24
buf.write(pack(b'>I', 0)) # Filled in later
# Number of index records 24-28
buf.write(pack(b'>I', 1 if secondary else len(self.records)))
# Index Encoding 28-32
buf.write(pack(b'>I', 65001)) # utf-8
# Unknown 32-36
buf.write(b'\xff'*4)
# Number of index entries 36-40
indices = list(SecondaryIndexEntry.entries) if secondary else self.indices
buf.write(pack(b'>I', len(indices)))
# ORDT offset 40-44
buf.write(pack(b'>I', 0))
# LIGT offset 44-48
buf.write(pack(b'>I', 0))
# Number of LIGT entries 48-52
buf.write(pack(b'>I', 0))
# Number of CNCX records 52-56
buf.write(pack(b'>I', 0 if secondary else len(self.cncx.records)))
# Unknown 56-180
buf.write(b'\0'*124)
# TAGX offset 180-184
buf.write(pack(b'>I', header_length))
# Unknown 184-192
buf.write(b'\0'*8)
# TAGX block
buf.write(tagx_block)
num = len(indices)
# The index of the last entry in the NCX
idx = indices[-1].index
buf.write(encode_number_as_hex(idx) if isinstance(idx, int) else
idx.encode('ascii'))
# The number of entries in the NCX
buf.write(pack(b'>H', num))
# Padding
pad = (4 - (buf.tell()%4))%4
if pad:
buf.write(b'\0'*pad)
idxt_offset = buf.tell()
buf.write(b'IDXT')
buf.write(pack(b'>H', header_length + len(tagx_block)))
buf.write(b'\0')
buf.seek(20)
buf.write(pack(b'>I', idxt_offset))
return align_block(buf.getvalue())
# }}}
def create_book_index(self): # {{{
indices = []
seen = set()
id_offsets = self.serializer.id_offsets
for node in self.oeb.toc.iterdescendants():
try:
offset = id_offsets[node.href]
label = self.cncx[node.title]
except:
self.log.warn('TOC item %s not found in document'%node.href)
continue
if offset in seen:
continue
seen.add(offset)
index = IndexEntry(offset, label)
indices.append(index)
indices.sort(key=lambda x:x.offset)
# Set lengths
for i, index in enumerate(indices):
try:
next_offset = indices[i+1].offset
except:
next_offset = self.serializer.body_end_offset
index.length = next_offset - index.offset
# Remove empty nodes
indices = [i for i in indices if i.length > 0]
# Set index values
for i, index in enumerate(indices):
index.index = i
# Set lengths again to close up any gaps left by filtering
for i, index in enumerate(indices):
try:
next_offset = indices[i+1].offset
except:
next_offset = self.serializer.body_end_offset
index.length = next_offset - index.offset
return indices
# }}}
def create_periodical_index(self): # {{{
periodical_node = iter(self.oeb.toc).next()
periodical_node_offset = self.serializer.body_start_offset
periodical_node_size = (self.serializer.body_end_offset -
periodical_node_offset)
normalized_sections = []
id_offsets = self.serializer.id_offsets
periodical = PeriodicalIndexEntry(periodical_node_offset,
self.cncx[periodical_node.title],
self.cncx[periodical_node.klass], 0)
periodical.length = periodical_node_size
periodical.first_child_index = 1
periodical.image_index = self.masthead_offset
seen_sec_offsets = set()
seen_art_offsets = set()
for sec in periodical_node:
normalized_articles = []
try:
offset = id_offsets[sec.href]
label = self.cncx[sec.title]
klass = self.cncx[sec.klass]
except:
continue
if offset in seen_sec_offsets:
continue
seen_sec_offsets.add(offset)
section = PeriodicalIndexEntry(offset, label, klass, 1)
section.parent_index = 0
for art in sec:
try:
offset = id_offsets[art.href]
label = self.cncx[art.title]
klass = self.cncx[art.klass]
except:
continue
if offset in seen_art_offsets:
continue
seen_art_offsets.add(offset)
article = PeriodicalIndexEntry(offset, label, klass, 2)
normalized_articles.append(article)
article.author_offset = self.cncx[art.author]
article.desc_offset = self.cncx[art.description]
if normalized_articles:
normalized_articles.sort(key=lambda x:x.offset)
normalized_sections.append((section, normalized_articles))
normalized_sections.sort(key=lambda x:x[0].offset)
# Set lengths
for s, x in enumerate(normalized_sections):
sec, normalized_articles = x
try:
sec.length = normalized_sections[s+1][0].offset - sec.offset
except:
sec.length = self.serializer.body_end_offset - sec.offset
for i, art in enumerate(normalized_articles):
try:
art.length = normalized_articles[i+1].offset - art.offset
except:
art.length = sec.offset + sec.length - art.offset
# Filter
for i, x in list(enumerate(normalized_sections)):
sec, normalized_articles = x
normalized_articles = list(filter(lambda x: x.length > 0,
normalized_articles))
normalized_sections[i] = (sec, normalized_articles)
normalized_sections = list(filter(lambda x: x[0].length > 0 and x[1],
normalized_sections))
# Set indices
i = 0
for sec, articles in normalized_sections:
i += 1
sec.index = i
sec.parent_index = 0
for sec, articles in normalized_sections:
for art in articles:
i += 1
art.index = i
art.parent_index = sec.index
for sec, normalized_articles in normalized_sections:
sec.first_child_index = normalized_articles[0].index
sec.last_child_index = normalized_articles[-1].index
# Set lengths again to close up any gaps left by filtering
for s, x in enumerate(normalized_sections):
sec, articles = x
try:
next_offset = normalized_sections[s+1][0].offset
except:
next_offset = self.serializer.body_end_offset
sec.length = next_offset - sec.offset
for a, art in enumerate(articles):
try:
next_offset = articles[a+1].offset
except:
next_offset = sec.next_offset
art.length = next_offset - art.offset
# Sanity check
for s, x in enumerate(normalized_sections):
sec, articles = x
try:
next_sec = normalized_sections[s+1][0]
except:
if (sec.length == 0 or sec.next_offset !=
self.serializer.body_end_offset):
raise ValueError('Invalid section layout')
else:
if next_sec.offset != sec.next_offset or sec.length == 0:
raise ValueError('Invalid section layout')
for a, art in enumerate(articles):
try:
next_art = articles[a+1]
except:
if (art.length == 0 or art.next_offset !=
sec.next_offset):
raise ValueError('Invalid article layout')
else:
if art.length == 0 or art.next_offset != next_art.offset:
raise ValueError('Invalid article layout')
# Flatten
indices = [periodical]
for sec, articles in normalized_sections:
indices.append(sec)
periodical.last_child_index = sec.index
for sec, articles in normalized_sections:
for a in articles:
indices.append(a)
return indices
# }}}
# TBS {{{
def calculate_trailing_byte_sequences(self):
self.tbs_map = {}
found_node = False
sections = [i for i in self.indices if i.depth == 1]
section_map = OrderedDict((i.index, i) for i in
sorted(sections, key=lambda x:x.offset))
deepest = max(i.depth for i in self.indices)
for i in xrange(self.number_of_text_records):
offset = i * RECORD_SIZE
next_offset = offset + RECORD_SIZE
data = {'ends':[], 'completes':[], 'starts':[],
'spans':None, 'offset':offset, 'record_number':i+1}
for index in self.indices:
if index.offset >= next_offset:
# Node starts after current record
if index.depth == deepest:
break
else:
continue
if index.next_offset <= offset:
# Node ends before current record
continue
if index.offset >= offset:
# Node starts in current record
if index.next_offset <= next_offset:
# Node ends in current record
data['completes'].append(index)
else:
data['starts'].append(index)
else:
# Node starts before current records
if index.next_offset <= next_offset:
# Node ends in current record
data['ends'].append(index)
elif index.depth == deepest:
data['spans'] = index
if (data['ends'] or data['completes'] or data['starts'] or
data['spans'] is not None):
self.tbs_map[i+1] = TBS(data, self.is_periodical, first=not
found_node, section_map=section_map)
found_node = True
else:
self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False,
after_first=found_node, section_map=section_map)
def get_trailing_byte_sequence(self, num):
return self.tbs_map[num].bytestring
# }}}
# }}}

View File

@ -0,0 +1,590 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, random, time
from cStringIO import StringIO
from struct import pack
from calibre.ebooks import normalize, generate_masthead
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
from calibre.ebooks.mobi.writer2.serializer import Serializer
from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.utils.filenames import ascii_filename
from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED, RECORD_SIZE)
from calibre.ebooks.mobi.utils import (rescale_image, encint,
encode_trailing_data, align_block, detect_periodical)
from calibre.ebooks.mobi.writer2.indexer import Indexer
EXTH_CODES = {
'creator': 100,
'publisher': 101,
'description': 103,
'identifier': 104,
'subject': 105,
'pubdate': 106,
'review': 107,
'contributor': 108,
'rights': 109,
'type': 111,
'source': 112,
'versionnumber': 114,
'startreading': 116,
'coveroffset': 201,
'thumboffset': 202,
'hasfakecover': 203,
'lastupdatetime': 502,
'title': 503,
}
# Disabled as I dont care about uncrossable breaks
WRITE_UNCROSSABLE_BREAKS = False
MAX_THUMB_SIZE = 16 * 1024
MAX_THUMB_DIMEN = (180, 240)
class MobiWriter(object):
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
def __init__(self, opts, write_page_breaks_after_item=True):
self.opts = opts
self.write_page_breaks_after_item = write_page_breaks_after_item
self.compression = UNCOMPRESSED if opts.dont_compress else PALMDOC
self.prefer_author_sort = opts.prefer_author_sort
self.last_text_record_idx = 1
def __call__(self, oeb, path_or_stream):
self.log = oeb.log
if hasattr(path_or_stream, 'write'):
return self.dump_stream(oeb, path_or_stream)
with open(path_or_stream, 'w+b') as stream:
return self.dump_stream(oeb, stream)
def write(self, *args):
for datum in args:
self.stream.write(datum)
def tell(self):
return self.stream.tell()
def dump_stream(self, oeb, stream):
self.oeb = oeb
self.stream = stream
self.records = [None]
self.generate_content()
self.generate_record0()
self.write_header()
self.write_content()
def generate_content(self):
self.is_periodical = detect_periodical(self.oeb.toc, self.oeb.log)
# Image records are stored in their own list, they are merged into the
# main record list at the end
self.generate_images()
self.generate_text()
# The uncrossable breaks trailing entries come before the indexing
# trailing entries
self.write_uncrossable_breaks()
# Index records come after text records
self.generate_index()
# Indexing {{{
def generate_index(self):
self.primary_index_record_idx = None
try:
self.indexer = Indexer(self.serializer, self.last_text_record_idx,
len(self.records[self.last_text_record_idx]),
self.masthead_offset, self.is_periodical,
self.opts, self.oeb)
except:
self.log.exception('Failed to generate MOBI index:')
else:
self.primary_index_record_idx = len(self.records)
for i in xrange(len(self.records)):
if i == 0: continue
tbs = self.indexer.get_trailing_byte_sequence(i)
self.records[i] += encode_trailing_data(tbs)
self.records.extend(self.indexer.records)
# }}}
def write_uncrossable_breaks(self): # {{{
'''
Write information about uncrossable breaks (non linear items in
the spine.
'''
if not WRITE_UNCROSSABLE_BREAKS:
return
breaks = self.serializer.breaks
for i in xrange(1, self.last_text_record_idx+1):
offset = i * RECORD_SIZE
pbreak = 0
running = offset
buf = StringIO()
while breaks and (breaks[0] - offset) < RECORD_SIZE:
pbreak = (breaks.pop(0) - running) >> 3
encoded = encint(pbreak)
buf.write(encoded)
running += pbreak << 3
encoded = encode_trailing_data(buf.getvalue())
self.records[i] += encoded
# }}}
# Images {{{
def generate_images(self):
oeb = self.oeb
oeb.logger.info('Serializing images...')
self.image_records = []
mh_href = self.masthead_offset = None
if 'masthead' in oeb.guide:
mh_href = oeb.guide['masthead'].href
elif self.is_periodical:
# Generate a default masthead
data = generate_masthead(unicode(self.oeb.metadata('title')[0]))
self.image_records.append(data)
self.masthead_offset = 0
cover_href = self.cover_offset = self.thumbnail_offset = None
if (oeb.metadata.cover and
unicode(oeb.metadata.cover[0]) in oeb.manifest.ids):
cover_id = unicode(oeb.metadata.cover[0])
item = oeb.manifest.ids[cover_id]
cover_href = item.href
for item in self.oeb.manifest.values():
if item.media_type not in OEB_RASTER_IMAGES: continue
try:
data = rescale_image(item.data)
except:
oeb.logger.warn('Bad image file %r' % item.href)
continue
else:
if item.href == mh_href:
self.masthead_offset = len(self.image_records) - 1
elif item.href == cover_href:
self.image_records.append(data)
self.cover_offset = len(self.image_records) - 1
try:
data = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
maxsizeb=MAX_THUMB_SIZE)
except:
oeb.logger.warn('Failed to generate thumbnail')
else:
self.image_records.append(data)
self.thumbnail_offset = len(self.image_records) - 1
finally:
item.unload_data_from_memory()
# }}}
# Text {{{
def generate_text(self):
self.oeb.logger.info('Serializing markup content...')
self.serializer = Serializer(self.oeb, self.images,
write_page_breaks_after_item=self.write_page_breaks_after_item)
text = self.serializer()
self.text_length = len(text)
text = StringIO(text)
nrecords = 0
records_size = 0
if self.compression != UNCOMPRESSED:
self.oeb.logger.info(' Compressing markup content...')
while text.tell() < self.text_length:
data, overlap = self.read_text_record(text)
if self.compression == PALMDOC:
data = compress_doc(data)
data += overlap
data += pack(b'>B', len(overlap))
self.records.append(data)
records_size += len(data)
nrecords += 1
self.last_text_record_idx = nrecords
self.first_non_text_record_idx = nrecords + 1
# Pad so that the next records starts at a 4 byte boundary
if records_size % 4 != 0:
self.records.append(b'\x00'*(records_size % 4))
self.first_non_text_record_idx += 1
def read_text_record(self, text):
'''
Return a Palmdoc record of size RECORD_SIZE from the text file object.
In case the record ends in the middle of a multibyte character return
the overlap as well.
Returns data, overlap: where both are byte strings. overlap is the
extra bytes needed to complete the truncated multibyte character.
'''
opos = text.tell()
text.seek(0, 2)
# npos is the position of the next record
npos = min((opos + RECORD_SIZE, text.tell()))
# Number of bytes from the next record needed to complete the last
# character in this record
extra = 0
last = b''
while not last.decode('utf-8', 'ignore'):
# last contains no valid utf-8 characters
size = len(last) + 1
text.seek(npos - size)
last = text.read(size)
# last now has one valid utf-8 char and possibly some bytes that belong
# to a truncated char
try:
last.decode('utf-8', 'strict')
except UnicodeDecodeError:
# There are some truncated bytes in last
prev = len(last)
while True:
text.seek(npos - prev)
last = text.read(len(last) + 1)
try:
last.decode('utf-8')
except UnicodeDecodeError:
pass
else:
break
extra = len(last) - prev
text.seek(opos)
data = text.read(RECORD_SIZE)
overlap = text.read(extra)
text.seek(npos)
return data, overlap
# }}}
def generate_record0(self): # MOBI header {{{
metadata = self.oeb.metadata
exth = self.build_exth()
first_image_record = None
if self.image_records:
first_image_record = len(self.records)
self.records.extend(self.image_records)
last_content_record = len(self.records) - 1
# FCIS/FLIS (Seems to serve no purpose)
flis_number = len(self.records)
self.records.append(
b'FLIS\0\0\0\x08\0\x41\0\0\0\0\0\0\xff\xff\xff\xff\0\x01\0\x03\0\0\0\x03\0\0\0\x01'+
b'\xff'*4)
fcis = b'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
fcis += pack(b'>I', self.text_length)
fcis += b'\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
fcis_number = len(self.records)
self.records.append(fcis)
# EOF record
self.records.append(b'\xE9\x8E\x0D\x0A')
record0 = StringIO()
# The MOBI Header
record0.write(pack(b'>HHIHHHH',
self.compression, # compression type # compression type
0, # Unused
self.text_length, # Text length
self.last_text_record_idx, # Number of text records or last tr idx
RECORD_SIZE, # Text record size
0, # Unused
0 # Unused
)) # 0 - 15 (0x0 - 0xf)
uid = random.randint(0, 0xffffffff)
title = normalize(unicode(metadata.title[0])).encode('utf-8')
# 0x0 - 0x3
record0.write(b'MOBI')
# 0x4 - 0x7 : Length of header
# 0x8 - 0x11 : MOBI type
# type meaning
# 0x002 MOBI book (chapter - chapter navigation)
# 0x101 News - Hierarchical navigation with sections and articles
# 0x102 News feed - Flat navigation
# 0x103 News magazine - same as 0x101
# 0xC - 0xF : Text encoding (65001 is utf-8)
# 0x10 - 0x13 : UID
# 0x14 - 0x17 : Generator version
bt = 0x002
if self.primary_index_record_idx is not None:
if self.indexer.is_flat_periodical:
bt = 0x102
elif self.indexer.is_periodical:
bt = 0x101
record0.write(pack(b'>IIIII',
0xe8, bt, 65001, uid, 6))
# 0x18 - 0x1f : Unknown
record0.write(b'\xff' * 8)
# 0x20 - 0x23 : Secondary index record
sir = 0xffffffff
if (self.primary_index_record_idx is not None and
self.indexer.secondary_record_offset is not None):
sir = (self.primary_index_record_idx +
self.indexer.secondary_record_offset)
record0.write(pack(b'>I', sir))
# 0x24 - 0x3f : Unknown
record0.write(b'\xff' * 28)
# 0x40 - 0x43 : Offset of first non-text record
record0.write(pack(b'>I',
self.first_non_text_record_idx))
# 0x44 - 0x4b : title offset, title length
record0.write(pack(b'>II',
0xe8 + 16 + len(exth), len(title)))
# 0x4c - 0x4f : Language specifier
record0.write(iana2mobi(
str(metadata.language[0])))
# 0x50 - 0x57 : Input language and Output language
record0.write(b'\0' * 8)
# 0x58 - 0x5b : Format version
# 0x5c - 0x5f : First image record number
record0.write(pack(b'>II',
6, first_image_record if first_image_record else len(self.records)))
# 0x60 - 0x63 : First HUFF/CDIC record number
# 0x64 - 0x67 : Number of HUFF/CDIC records
# 0x68 - 0x6b : First DATP record number
# 0x6c - 0x6f : Number of DATP records
record0.write(b'\0' * 16)
# 0x70 - 0x73 : EXTH flags
# Bit 6 (0b1000000) being set indicates the presence of an EXTH header
# The purpose of the other bits is unknown
exth_flags = 0b1010000
if self.is_periodical:
exth_flags |= 0b1000
record0.write(pack(b'>I', exth_flags))
# 0x74 - 0x93 : Unknown
record0.write(b'\0' * 32)
# 0x94 - 0x97 : DRM offset
# 0x98 - 0x9b : DRM count
# 0x9c - 0x9f : DRM size
# 0xa0 - 0xa3 : DRM flags
record0.write(pack(b'>IIII',
0xffffffff, 0xffffffff, 0, 0))
# 0xa4 - 0xaf : Unknown
record0.write(b'\0'*12)
# 0xb0 - 0xb1 : First content record number
# 0xb2 - 0xb3 : last content record number
# (Includes Image, DATP, HUFF, DRM)
record0.write(pack(b'>HH', 1, last_content_record))
# 0xb4 - 0xb7 : Unknown
record0.write(b'\0\0\0\x01')
# 0xb8 - 0xbb : FCIS record number
record0.write(pack(b'>I', fcis_number))
# 0xbc - 0xbf : Unknown (FCIS record count?)
record0.write(pack(b'>I', 1))
# 0xc0 - 0xc3 : FLIS record number
record0.write(pack(b'>I', flis_number))
# 0xc4 - 0xc7 : Unknown (FLIS record count?)
record0.write(pack(b'>I', 1))
# 0xc8 - 0xcf : Unknown
record0.write(b'\0'*8)
# 0xd0 - 0xdf : Unknown
record0.write(pack(b'>IIII', 0xffffffff, 0, 0xffffffff, 0xffffffff))
# 0xe0 - 0xe3 : Extra record data
# Extra record data flags:
# - 0b1 : <extra multibyte bytes><size>
# - 0b10 : <TBS indexing description of this HTML record><size>
# - 0b100: <uncrossable breaks><size>
# Setting bit 2 (0x2) disables <guide><reference type="start"> functionality
extra_data_flags = 0b1 # Has multibyte overlap bytes
if self.primary_index_record_idx is not None:
extra_data_flags |= 0b10
if WRITE_UNCROSSABLE_BREAKS:
extra_data_flags |= 0b100
record0.write(pack(b'>I', extra_data_flags))
# 0xe4 - 0xe7 : Primary index record
record0.write(pack(b'>I', 0xffffffff if self.primary_index_record_idx
is None else self.primary_index_record_idx))
record0.write(exth)
record0.write(title)
record0 = record0.getvalue()
# Add some buffer so that Amazon can add encryption information if this
# MOBI is submitted for publication
record0 += (b'\0' * (1024*8))
self.records[0] = align_block(record0)
# }}}
def build_exth(self): # EXTH Header {{{
oeb = self.oeb
exth = StringIO()
nrecs = 0
for term in oeb.metadata:
if term not in EXTH_CODES: continue
code = EXTH_CODES[term]
items = oeb.metadata[term]
if term == 'creator':
if self.prefer_author_sort:
creators = [normalize(unicode(c.file_as or c)) for c in items]
else:
creators = [normalize(unicode(c)) for c in items]
items = ['; '.join(creators)]
for item in items:
data = self.COLLAPSE_RE.sub(' ', normalize(unicode(item)))
if term == 'identifier':
if data.lower().startswith('urn:isbn:'):
data = data[9:]
elif item.scheme.lower() == 'isbn':
pass
else:
continue
data = data.encode('utf-8')
exth.write(pack(b'>II', code, len(data) + 8))
exth.write(data)
nrecs += 1
if term == 'rights' :
try:
rights = normalize(unicode(oeb.metadata.rights[0])).encode('utf-8')
except:
rights = b'Unknown'
exth.write(pack(b'>II', EXTH_CODES['rights'], len(rights) + 8))
exth.write(rights)
nrecs += 1
# Write UUID as ASIN
uuid = None
from calibre.ebooks.oeb.base import OPF
for x in oeb.metadata['identifier']:
if (x.get(OPF('scheme'), None).lower() == 'uuid' or
unicode(x).startswith('urn:uuid:')):
uuid = unicode(x).split(':')[-1]
break
if uuid is None:
from uuid import uuid4
uuid = str(uuid4())
if isinstance(uuid, unicode):
uuid = uuid.encode('utf-8')
exth.write(pack(b'>II', 113, len(uuid) + 8))
exth.write(uuid)
nrecs += 1
# Write cdetype
if self.is_periodical:
data = b'NWPR'
else:
data = b'EBOK'
exth.write(pack(b'>II', 501, len(data)+8))
exth.write(data)
nrecs += 1
# Add a publication date entry
if oeb.metadata['date']:
datestr = str(oeb.metadata['date'][0])
elif oeb.metadata['timestamp']:
datestr = str(oeb.metadata['timestamp'][0])
if datestr is None:
raise ValueError("missing date or timestamp")
datestr = bytes(datestr)
exth.write(pack(b'>II', EXTH_CODES['pubdate'], len(datestr) + 8))
exth.write(datestr)
nrecs += 1
if self.is_periodical:
exth.write(pack(b'>II', EXTH_CODES['lastupdatetime'], len(datestr) + 8))
exth.write(datestr)
nrecs += 1
exth.write(pack(b'>III', EXTH_CODES['versionnumber'], 12, 7))
nrecs += 1
if self.is_periodical:
# Pretend to be amazon's super secret periodical generator
vals = {204:201, 205:2, 206:0, 207:101}
else:
# Pretend to be kindlegen 1.2
vals = {204:201, 205:1, 206:2, 207:33307}
for code, val in vals:
exth.write(pack(b'>III', code, 12, val))
nrecs += 1
if self.cover_offset is not None:
exth.write(pack(b'>III', EXTH_CODES['coveroffset'], 12,
self.cover_offset))
exth.write(pack(b'>III', EXTH_CODES['hasfakecover'], 12, 0))
nrecs += 2
if self.thumbnail_offset is not None:
exth.write(pack(b'>III', EXTH_CODES['thumboffset'], 12,
self.thumbnail_offset))
nrecs += 1
if self.serializer.start_offset is not None:
exth.write(pack(b'>III', EXTH_CODES['startreading'], 12,
self.serializer.start_offset))
nrecs += 1
exth = exth.getvalue()
trail = len(exth) % 4
pad = b'\0' * (4 - trail) # Always pad w/ at least 1 byte
exth = [b'EXTH', pack(b'>II', len(exth) + 12, nrecs), exth, pad]
return b''.join(exth)
# }}}
def write_header(self): # PalmDB header {{{
'''
Write the PalmDB header
'''
title = ascii_filename(unicode(self.oeb.metadata.title[0])).replace(
' ', '_')
title = title + (b'\0' * (32 - len(title)))
now = int(time.time())
nrecords = len(self.records)
self.write(title, pack(b'>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0),
b'BOOK', b'MOBI', pack(b'>IIH', (2*nrecords)-1, 0, nrecords))
offset = self.tell() + (8 * nrecords) + 2
for i, record in enumerate(self.records):
self.write(pack(b'>I', offset), b'\0', pack(b'>I', 2*i)[1:])
offset += len(record)
self.write(b'\0\0')
# }}}
def write_content(self):
for record in self.records:
self.write(record)

View File

@ -0,0 +1,306 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.ebooks.oeb.base import (OEB_DOCS, XHTML, XHTML_NS, XML_NS,
namespace, prefixname, urlnormalize)
from calibre.ebooks.mobi.mobiml import MBP_NS
from collections import defaultdict
from urlparse import urldefrag
from cStringIO import StringIO
class Serializer(object):
NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'}
def __init__(self, oeb, images, write_page_breaks_after_item=True):
'''
Write all the HTML markup in oeb into a single in memory buffer
containing a single html document with links replaced by offsets into
the buffer.
:param oeb: OEBBook object that encapsulates the document to be
processed.
:param images: Mapping of image hrefs (urlnormalized) to image record
indices.
:param write_page_breaks_after_item: If True a MOBIpocket pagebreak tag
is written after every element of the spine in ``oeb``.
'''
self.oeb = oeb
self.images = images
self.logger = oeb.logger
self.write_page_breaks_after_item = write_page_breaks_after_item
# If not None, this is a number pointing to the location at which to
# open the MOBI file on the Kindle
self.start_offset = None
# Mapping of hrefs (urlnormalized) to the offset in the buffer where
# the resource pointed to by the href lives. Used at the end to fill in
# the correct values into all filepos="..." links.
self.id_offsets = {}
# Mapping of hrefs (urlnormalized) to a list of offsets into the buffer
# where filepos="..." elements are written corresponding to links that
# point to the href. This is used at the end to fill in the correct values.
self.href_offsets = defaultdict(list)
# List of offsets in the buffer of non linear items in the spine. These
# become uncrossable breaks in the MOBI
self.breaks = []
self.find_blocks()
def find_blocks(self):
'''
Mark every item in the spine if it is the start/end of a
section/article, so that it can be wrapped in divs appropriately.
'''
for item in self.oeb.spine:
item.is_section_start = item.is_section_end = False
item.is_article_start = item.is_article_end = False
def spine_item(tocitem):
href = urldefrag(tocitem.href)[0]
for item in self.oeb.spine:
if item.href == href:
return item
for item in self.oeb.toc.iterdescendants():
if item.klass == 'section':
articles = list(item)
if not articles: continue
spine_item(item).is_section_start = True
for i, article in enumerate(articles):
si = spine_item(article)
si.is_article_start = True
items = list(self.oeb.spine)
in_sec = in_art = False
for i, item in enumerate(items):
try:
prev_item = items[i-1]
except:
prev_item = None
if in_art and item.is_article_start == True:
prev_item.is_article_end = True
in_art = False
if in_sec and item.is_section_start == True:
prev_item.is_section_end = True
in_sec = False
if item.is_section_start: in_sec = True
if item.is_article_start: in_art = True
item.is_section_end = item.is_article_end = True
def __call__(self):
'''
Return the document serialized as a single UTF-8 encoded bytestring.
'''
buf = self.buf = StringIO()
buf.write(b'<html>')
self.serialize_head()
self.serialize_body()
buf.write(b'</html>')
self.fixup_links()
return buf.getvalue()
def serialize_head(self):
buf = self.buf
buf.write(b'<head>')
if len(self.oeb.guide) > 0:
self.serialize_guide()
buf.write(b'</head>')
def serialize_guide(self):
'''
The Kindle decides where to open a book based on the presence of
an item in the guide that looks like
<reference type="text" title="Start" href="chapter-one.xhtml"/>
Similarly an item with type="toc" controls where the Goto Table of
Contents operation on the kindle goes.
'''
buf = self.buf
hrefs = self.oeb.manifest.hrefs
buf.write(b'<guide>')
for ref in self.oeb.guide.values():
path = urldefrag(ref.href)[0]
if path not in hrefs or hrefs[path].media_type not in OEB_DOCS:
continue
buf.write(b'<reference type="')
if ref.type.startswith('other.') :
self.serialize_text(ref.type.replace('other.',''), quot=True)
else:
self.serialize_text(ref.type, quot=True)
buf.write(b'" ')
if ref.title is not None:
buf.write(b'title="')
self.serialize_text(ref.title, quot=True)
buf.write(b'" ')
if ref.title == 'start':
self._start_href = ref.href
self.serialize_href(ref.href)
# Space required or won't work, I kid you not
buf.write(b' />')
buf.write(b'</guide>')
def serialize_href(self, href, base=None):
'''
Serialize the href attribute of an <a> or <reference> tag. It is
serialized as filepos="000000000" and a pointer to its location is
stored in self.href_offsets so that the correct value can be filled in
at the end.
'''
hrefs = self.oeb.manifest.hrefs
path, frag = urldefrag(urlnormalize(href))
if path and base:
path = base.abshref(path)
if path and path not in hrefs:
return False
buf = self.buf
item = hrefs[path] if path else None
if item and item.spine_position is None:
return False
path = item.href if item else base.href
href = '#'.join((path, frag)) if frag else path
buf.write(b'filepos=')
self.href_offsets[href].append(buf.tell())
buf.write(b'0000000000')
return True
def serialize_body(self):
'''
Serialize all items in the spine of the document. Non linear items are
moved to the end.
'''
buf = self.buf
self.anchor_offset = buf.tell()
buf.write(b'<body>')
self.body_start_offset = buf.tell()
spine = [item for item in self.oeb.spine if item.linear]
spine.extend([item for item in self.oeb.spine if not item.linear])
for item in spine:
self.serialize_item(item)
self.body_end_offset = buf.tell()
buf.write(b'</body>')
def serialize_item(self, item):
'''
Serialize an individual item from the spine of the input document.
A reference to this item is stored in self.href_offsets
'''
buf = self.buf
if not item.linear:
self.breaks.append(buf.tell() - 1)
self.id_offsets[urlnormalize(item.href)] = buf.tell()
if item.is_section_start:
buf.write(b'<div>')
if item.is_article_start:
buf.write(b'<div>')
for elem in item.data.find(XHTML('body')):
self.serialize_elem(elem, item)
if item.is_article_end:
# Kindle periodical article end marker
buf.write(b'<div></div>')
if self.write_page_breaks_after_item:
buf.write(b'<mbp:pagebreak/>')
if item.is_article_end:
buf.write(b'</div>')
if item.is_section_end:
buf.write(b'</div>')
self.anchor_offset = None
def serialize_elem(self, elem, item, nsrmap=NSRMAP):
buf = self.buf
if not isinstance(elem.tag, basestring) \
or namespace(elem.tag) not in nsrmap:
return
tag = prefixname(elem.tag, nsrmap)
# Previous layers take care of @name
id_ = elem.attrib.pop('id', None)
if id_:
href = '#'.join((item.href, id_))
offset = self.anchor_offset or buf.tell()
self.id_offsets[urlnormalize(href)] = offset
if self.anchor_offset is not None and \
tag == 'a' and not elem.attrib and \
not len(elem) and not elem.text:
return
self.anchor_offset = buf.tell()
buf.write(b'<')
buf.write(tag.encode('utf-8'))
if elem.attrib:
for attr, val in elem.attrib.items():
if namespace(attr) not in nsrmap:
continue
attr = prefixname(attr, nsrmap)
buf.write(b' ')
if attr == 'href':
if self.serialize_href(val, item):
continue
elif attr == 'src':
href = urlnormalize(item.abshref(val))
if href in self.images:
index = self.images[href]
buf.write(b'recindex="%05d"' % index)
continue
buf.write(attr.encode('utf-8'))
buf.write(b'="')
self.serialize_text(val, quot=True)
buf.write(b'"')
buf.write(b'>')
if elem.text or len(elem) > 0:
if elem.text:
self.anchor_offset = None
self.serialize_text(elem.text)
for child in elem:
self.serialize_elem(child, item)
if child.tail:
self.anchor_offset = None
self.serialize_text(child.tail)
buf.write(b'</%s>' % tag.encode('utf-8'))
def serialize_text(self, text, quot=False):
text = text.replace('&', '&amp;')
text = text.replace('<', '&lt;')
text = text.replace('>', '&gt;')
text = text.replace(u'\u00AD', '') # Soft-hyphen
if quot:
text = text.replace('"', '&quot;')
self.buf.write(text.encode('utf-8'))
def fixup_links(self):
'''
Fill in the correct values for all filepos="..." links with the offsets
of the linked to content (as stored in id_offsets).
'''
buf = self.buf
id_offsets = self.id_offsets
for href, hoffs in self.href_offsets.items():
is_start = (href and href == getattr(self, '_start_href', None))
# Iterate over all filepos items
if href not in id_offsets:
self.logger.warn('Hyperlink target %r not found' % href)
# Link to the top of the document, better than just ignoring
href, _ = urldefrag(href)
if href in self.id_offsets:
ioff = self.id_offsets[href]
if is_start:
self.start_offset = ioff
for hoff in hoffs:
buf.seek(hoff)
buf.write(b'%010d' % ioff)

View File

@ -1180,8 +1180,9 @@ class Manifest(object):
if memory is None:
from calibre.ptempfile import PersistentTemporaryFile
pt = PersistentTemporaryFile(suffix='_oeb_base_mem_unloader.img')
with pt:
pt.write(self._data)
pt.close()
self.oeb._temp_files.append(pt.name)
def loader(*args):
with open(pt.name, 'rb') as f:
ans = f.read()
@ -1196,8 +1197,6 @@ class Manifest(object):
self._loader = loader2
self._data = None
def __str__(self):
data = self.data
if isinstance(data, etree._Element):
@ -1681,8 +1680,15 @@ class TOC(object):
return True
return False
def iterdescendants(self):
def iterdescendants(self, breadth_first=False):
"""Iterate over all descendant nodes in depth-first order."""
if breadth_first:
for child in self.nodes:
yield child
for child in self.nodes:
for node in child.iterdescendants(breadth_first=True):
yield node
else:
for child in self.nodes:
for node in child.iter():
yield node
@ -1913,6 +1919,14 @@ class OEBBook(object):
self.toc = TOC()
self.pages = PageList()
self.auto_generated_toc = True
self._temp_files = []
def clean_temp_files(self):
for path in self._temp_files:
try:
os.remove(path)
except:
pass
@classmethod
def generate(cls, opts):

View File

@ -92,7 +92,7 @@ class EbookIterator(object):
self.config = DynamicConfig(name='iterator')
ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower()
ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
self.ebook_ext = ext
self.ebook_ext = ext.replace('original_', '')
def search(self, text, index, backwards=False):
text = text.lower()

View File

@ -163,6 +163,8 @@ class OEBReader(object):
if item.media_type in check:
try:
item.data
except KeyboardInterrupt:
raise
except:
self.logger.exception('Failed to parse content in %s'%
item.href)
@ -186,8 +188,13 @@ class OEBReader(object):
href, _ = urldefrag(href)
if not href:
continue
try:
href = item.abshref(urlnormalize(href))
scheme = urlparse(href).scheme
except:
self.oeb.log.exception(
'Skipping invalid href: %r'%href)
continue
if not scheme and href not in known:
new.add(href)
elif item.media_type in OEB_STYLES:

View File

@ -318,7 +318,8 @@ class CSSFlattener(object):
for edge in ('top', 'bottom'):
cssdict['%s-%s'%(prop, edge)] = '0pt'
if self.context.insert_blank_line:
cssdict['margin-top'] = cssdict['margin-bottom'] = '0.5em'
cssdict['margin-top'] = cssdict['margin-bottom'] = \
'%fem'%self.context.insert_blank_line_size
if self.context.remove_paragraph_spacing:
cssdict['text-indent'] = "%1.1fem" % self.context.remove_paragraph_spacing_indent_size

View File

@ -36,5 +36,8 @@ class Clean(object):
href = urldefrag(self.oeb.guide[x].href)[0]
if x.lower() not in ('cover', 'titlepage', 'masthead', 'toc',
'title-page', 'copyright-page', 'start'):
item = self.oeb.guide[x]
if item.title and item.title.lower() == 'start':
continue
self.oeb.guide.remove(x)

View File

@ -45,9 +45,10 @@ body > .calibre_toc_block {
}
class HTMLTOCAdder(object):
def __init__(self, title=None, style='nested'):
def __init__(self, title=None, style='nested', position='end'):
self.title = title
self.style = style
self.position = position
@classmethod
def config(cls, cfg):
@ -98,7 +99,10 @@ class HTMLTOCAdder(object):
self.add_toc_level(body, oeb.toc)
id, href = oeb.manifest.generate('contents', 'contents.xhtml')
item = oeb.manifest.add(id, href, XHTML_MIME, data=contents)
if self.position == 'end':
oeb.spine.add(item, linear=False)
else:
oeb.spine.insert(0, item, linear=True)
oeb.guide.add('toc', 'Table of Contents', href)
def add_toc_level(self, elem, toc):

View File

@ -47,15 +47,19 @@ def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False):
m.add('series', mi.series)
elif override_input_metadata:
m.clear('series')
if not mi.is_null('isbn'):
identifiers = mi.get_identifiers()
set_isbn = False
for typ, val in identifiers.iteritems():
has = False
if typ.lower() == 'isbn':
set_isbn = True
for x in m.identifier:
if x.scheme.lower() == 'isbn':
x.content = mi.isbn
if x.scheme.lower() == typ.lower():
x.content = val
has = True
if not has:
m.add('identifier', mi.isbn, scheme='ISBN')
elif override_input_metadata:
m.add('identifier', val, scheme=typ.upper())
if override_input_metadata and not set_isbn:
m.filter('identifier', lambda x: x.scheme.lower() == 'isbn')
if not mi.is_null('language'):
m.clear('language')

View File

@ -47,7 +47,10 @@ class ManifestTrimmer(object):
item.data is not None:
hrefs = [r[2] for r in iterlinks(item.data)]
for href in hrefs:
try:
href = item.abshref(urlnormalize(href))
except:
continue
if href in oeb.manifest.hrefs:
found = oeb.manifest.hrefs[href]
if found not in used:

View File

@ -165,6 +165,7 @@ class PDFWriter(QObject): # {{{
printer = get_pdf_printer(self.opts)
printer.setOutputFileName(item_path)
self.view.print_(printer)
printer.abort()
self._render_book()
def _delete_tmpdir(self):
@ -186,6 +187,7 @@ class PDFWriter(QObject): # {{{
draw_image_page(printer, painter, p,
preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
painter.end()
printer.abort()
def _write(self):

Some files were not shown because too many files have changed in this diff Show More