Merge branch 'kovidgoyal/master'

This commit is contained in:
Charles Haley 2013-06-13 09:16:28 +02:00
commit 051e27bad4
67 changed files with 3220 additions and 1573 deletions

8
.gitattributes vendored
View File

@ -1,13 +1,14 @@
# Tell git what files are txt
*.py text
*.recipe text
*.recipe text diff=python
*.coffee text
*.js text
*.pot text
*.po text
*.html text
*.css text
*.xhtml text
*.htm text
*.css text
*.rst text
*.md text
*.txt text
@ -20,12 +21,13 @@
*.tmpl text
*.qrc text
*.sh text
*.xhtml text
*.fb2 text
*.bat text eol=crlf
# Tell git what files are binary
*.zip binary
*.epub binary
*.docx binary
*.jpg binary
*.jpeg binary
*.png binary

54
.gitignore vendored
View File

@ -39,54 +39,6 @@ nbproject/
.settings/
*.DS_Store
calibre_plugins/
recipes/.git
recipes/.gitignore
recipes/README.md
recipes/icon_checker.py
recipes/readme_updater.py
recipes/garfield.recipe
recipes/katalog_egazeciarz.recipe
recipes/tv_axnscifi.recipe
recipes/tv_comedycentral.recipe
recipes/tv_discoveryscience.recipe
recipes/tv_foxlife.recipe
recipes/tv_fox.recipe
recipes/tv_hbo.recipe
recipes/tv_kinopolska.recipe
recipes/tv_nationalgeographic.recipe
recipes/tv_polsat2.recipe
recipes/tv_polsat.recipe
recipes/tv_tv4.recipe
recipes/tv_tvn7.recipe
recipes/tv_tvn.recipe
recipes/tv_tvp1.recipe
recipes/tv_tvp2.recipe
recipes/tv_tvphd.recipe
recipes/tv_tvphistoria.recipe
recipes/tv_tvpkultura.recipe
recipes/tv_tvppolonia.recipe
recipes/tv_tvpuls.recipe
recipes/tv_viasathistory.recipe
recipes/icons/katalog_egazeciarz.png
recipes/icons/garfield.png
recipes/icons/tv_axnscifi.png
recipes/icons/tv_comedycentral.png
recipes/icons/tv_discoveryscience.png
recipes/icons/tv_foxlife.png
recipes/icons/tv_fox.png
recipes/icons/tv_hbo.png
recipes/icons/tv_kinopolska.png
recipes/icons/tv_nationalgeographic.png
recipes/icons/tv_polsat2.png
recipes/icons/tv_polsat.png
recipes/icons/tv_tv4.png
recipes/icons/tv_tvn7.png
recipes/icons/tv_tvn.png
recipes/icons/tv_tvp1.png
recipes/icons/tv_tvp2.png
recipes/icons/tv_tvphd.png
recipes/icons/tv_tvphistoria.png
recipes/icons/tv_tvpkultura.png
recipes/icons/tv_tvppolonia.png
recipes/icons/tv_tvpuls.png
recipes/icons/tv_viasathistory.png
recipes/*.mobi
recipes/*.epub
recipes/debug

View File

@ -1,4 +1,823 @@
- version: 0.8.69
date: 2012-09-14
new features:
- title: "E-book viewer: Add a button to the toolbar to switch themes easily"
tickets: [1047992]
- title: "When downloading metadata for many books, if some of them fail, add an option to the downloaded message to show the failed books in the main book list, so that they can be individually processed easily"
- title: "Remember last used window size of the conversion dialogs."
tickets: [1049265]
- title: "Kindle driver: Turn on sending of azw3 files to kindles by default, since the KK now has azw3 support"
- title: "Conversion: Add support for CSS pseudo classes :hover, :link, :visited, :first-line, :focus, :active, :first-letter"
- title: "Wireless device driver: Make the default save template not use folders"
bug fixes:
- title: "Fix a regression in the previous release that broke sending of books to the second SD card in SONY readers"
tickets: [1047992]
- title: "Fix a memory leak when scanning for devices in windows"
- title: "Ebook-viewer: When displaying mathematics, reflow equations that don't fit on a single line"
- title: "Catalogs: Do not mark the AZW3 catalog as a periodical, as most Kindle devices cannot handle AZW3 periodicals"
- title: "Content server: When using a custom IP address to listen on via Preferences->Tweaks advertise that IP address via BonJour."
- title: "Fix ebook catalog generation on linux systems where the encoding is not UTF-8."
tickets: [1048404]
improved recipes:
- De Volkskrant
- Metro UK
- Countryfile
- Die Zeit (subscription)
- Birmingham post
new recipes:
- title: History Today
author: Rick Shang
- version: 0.8.68
date: 2012-09-07
new features:
- title: "Drivers for the Nokia N9, Viewsonic 7e, Prestigio PER3274B and Coby Kyros 7035 "
tickets: [1046794,1046544]
- title: "Add a tutorial on creating catalogs to the User Manual and a link to it in the create catalogs dialog"
- title: "Wireless device connections: Add an option to force calibre to listen on a particular IP address. Access it by customizing the plugin in Preferences->Plugins"
- title: "Android driver: Add an extra customization option to configure the directory to which ebooks are sent on the storage cards."
tickets: [1045045]
- title: "Add an option under Preferences->Look & Feel->Book Details to hide the cover in the book details panel"
- title: "The Calibre Companion Android app that allows wireless connection of Android device to calibre is out of beta. See https://play.google.com/store/apps/details?id=com.multipie.calibreandroid"
bug fixes:
- title: "Fix sorting by author not working in the device view in calibre when connected to iTunes"
tickets: [1044619]
- title: "Fix using the 'configure this device' menu action not validating settings"
- title: "Device drivers: Ignore corrupted entries in metadata.calibre, instead of raising an error"
- title: "PDF Output: Do not error out when generating an outline which points to pages that have been removed."
tickets: [1044799]
- title: "PDF Output: Fix incorrect page numbers being generated in the outline when converting some books"
- title: "PDF Output: Reduce memory consumption when writing out the PDF file, by using a stream"
- title: "EPUB metadata: When there are multiple <dc:date> tags use the one with the earliest date as the published date"
improved recipes:
- Wall Street journal (subscription version)
- Houston Chronicle
- Various Romanian news sources
- Business Week Magazine
- Arcamax
- version: 0.8.67
date: 2012-08-31
new features:
- title: "PDF Output: Generate a PDF Outline based on the Table of Contents of the input document"
- title: "Conversion: Add an option under Structure Detection to set the 'Start reading at' metadata with an XPath expression."
tickets: [1043233]
- title: "Speed up changing the title and author of files with books larger than 3MB by avoiding an unnecessary extra copy."
- title: "Wireless device driver: Make detecting and connecting to devices easier on networks where mdns is disabled"
- title: "PDF Output: Allow choosing the default font family and size when generating PDF files (under PDF Options) in the conversion dialog"
- title: "Metadata dialog: Comments editor: Allow specifying the name of a link when using the insert link button."
tickets: [1042683]
- title: "Remove the unmaintained pdfmanipulate command line utility. There are many other tools that provide similar functionality, for example, pdftk and podofo"
bug fixes:
- title: "Catalogs: Fix regression that broke sorting of non series titles before series titles"
- title: "PDF Output: Do not create duplicate embedded fonts in the PDF for every individual HTML file in the input document"
- title: "Fix regression that broke DnD of files having a # character in their names to the book details panel"
- title: "PDF Output: Allow generating PDF files with more than 512 pages on windows."
tickets: [1041614]
- title: "Fix minor bug in handling of the completion popups when using the next/previous buttons in the edit metadata dialog"
tickets: [1041389]
improved recipes:
- Coding Horror
- TIME Magazine
new recipes:
- title: Cumhuriyet Yazarlar
author: Sethi Eksi
- title: Arcadia
author: Masahiro Hasegawa
- title: Business Week Magazine and Chronicle of Higher Education
author: Rick Shang
- title: CIPER Chile
author: Darko Miletic
- version: 0.8.66
date: 2012-08-24
new features:
- title: "E-book viewer: Support the display of mathematics in e-books. Supports both embedded TeX and MathML"
description: "The calibre ebook viewer can now display embedded mathematics (symbols, equations, fractions, matrices, etc.) in EPUB and HTML ebooks. For details, see: http://manual.calibre-ebook.com/typesetting_math.html"
type: major
- title: "Drivers for SONY PRS-T2, Freelander PD10 and Coolreader Tablet"
tickets: [1039103]
- title: "Wireless device connections: Use a streamed mode for improved networking performance leading to much less time spent sending metadata to/from the device. Also make it easier to specify a fixed port directly in the dialog used to start the connection."
- title: "Get books: Add ebooksgratuitis.com"
bug fixes:
- title: "PDF Output: Handle input epub documents with filenames starting with a dot. Also do not hang if there is an unhandled error."
tickets: [1040603]
- title: "Get Books: Update B&N plugin to handle changes to the B&N website"
- title: "Content server: Fix regression that caused the port being advertised via BonJour to be incorrect if the user changed the port for the server."
tickets: [1037912]
improved recipes:
- Variety
- The Times UK
new recipes:
- title: Le Monde subscription version
author: Remi Vanicat
- title: Brecha Digital
author: Darko Miletic
- version: 0.8.65
date: 2012-08-17
new features:
- title: "A new wireless device driver. This allows connecting wirelessly to a device running a 'smart' calibre client"
description: "The wireless connection functions just as if the device was plugged into the computer by USB cable. Currently, Android devices are supported. See https://play.google.com/store/apps/details?id=com.multipie.calibreandroid"
type: major
- title: "MOBI Output: Add an option to control the type of MOBI file produced, to the MOBI Output conversion options. You can now generate an old MOBI6, a new KF8 or a joint MOBI6/KF8 file. By default, MOBI6 files are generated. This replaces the previous use of a tweak."
- title: "E-book viewer: Make paged mode the default. You can go back to the old flow mode by clicking the button with the yellow scroll in the top right corner of the viewer."
- title: "Driver for COBY Kyros MID7042 and Samsung Galaxy Ace S5839i"
bug fixes:
- title: "Update version of poppler bundled with calibre to fix reading covers from some PDF files"
- title: "Get Books: Fix clicking of results from Diesel books when there is only a single result not working"
- title: "Improve detection of system language on first run of calibre"
tickets: [1036354]
- title: "When finding the next series index and the last series index is a fractional number, use the next largest integer, instead of just adding 1"
- title: "Fix exception when saving a search/replace when no saved search/replace had been opened previously in the bulk search/replace dialog"
tickets: [1036464]
- title: "Fix restore database not restoring entries for the original_* formats"
- title: "Fix first run wizard not allowing empty email sending settings"
tickets: [1036358]
- title: "Do not error out when setting the cover for a book that has no folders in the library."
tickets: [1035935]
- title: "Conversion pipeline: Ignore unparseable values in the color attribute of font tags, instead of erroring out on them."
tickets: [1035633]
- title: "Catalogs: Fix regression that broke creation of catalogs while a device is connected"
- title: "Fix --with-library=/whatever not working for calibredb list"
improved recipes:
- Slashdot
- Various Canadian newspapers
- Business Spectator
- version: 0.8.64
date: 2012-08-09
new features:
- title: "E-book viewer: Allow viewing images in the book in a separate pop-up window by right clicking on the image. Useful if you want to keep some image, like a map to the side while reading the book."
- title: "Catalogs: Allow generation of catalogs in AZW3 format. Also add more powerful configuration options to exclude books and set prefixes. See http://www.mobileread.com/forums/showthread.php?t=187298 for details."
- title: "Generate a PDF version of the User Manual"
bug fixes:
- title: "News download: Fix broken handling of nesting for HTML 5 tags when parsing with BeautifulSoup"
- title: "EPUB: Handle files in the EPUB that have semi-colons in their file names. This means in particular using URL escaping when creating the NCX as ADE cannot handle unescaped semi-colons in the NCX."
tickets: [1033665]
- title: "Conversion pipeline: Ignore unparseable CSS instead of erroring out on it."
tickets: [1034074]
- title: "When setting up a column coloring rule based on the languages column, allow entry of localized language names instead of only ISO codes"
- title: "Catalogs: Generate cover for mobi/azw3 catalogs"
- title: "Update the last modified column record of a book, whenever a format is added to the book."
- title: "E-book viewer: Fix line scrolling stops at breaks option not working in paged mode"
tickets: [1033430]
- title: "MOBI Output: Fix ToC at start option having no effect when converting some input documents that have an out-of-spine ToC."
tickets: [1033656]
- title: "Catalog Generation: When generating EPUB/MOBI catalogs add more flexible rules for excluding books. Also add rules to customize the prefix characters used."
- title: "Make setting published date using metadata search/replace more robust."
- title: "Tag Browser: Flatten the display of sub-groups when sort by is not set to 'name'."
tickets: [1032746]
- title: "Fix isbn:false not matching if other identifiers are attached to the book."
improved recipes:
- The New Republic
- ZDNet
- Metro UK
- FHM UK
new recipes:
- title: eKundelek.pl
author: Artur Stachecki
- title: Sueddeutsche Mobil
author: Andreas Zeiser
- version: 0.8.63
date: 2012-08-02
new features:
- title: "E-book viewer: Allow quick saving and loading of viewer settings as 'themes'."
tickets: [1024611]
- title: "Ebook-viewer: Add a restore defaults button to the viewer preferences dialog"
- title: "E-book viewer: Add simple settings for text and background colors"
- title: "Add an entry to save to disk when right clicking a format in the book details panel"
- title: "ODT metadata: Read first image as the metadata cover from ODT files. Also allow ODT authors to set custom properties for extended metadata."
- title: "E-book viewer and PDF Output: Resize images that are longer than the page to fit onto a single page"
bug fixes:
- title: "KF8 Output: Fix bug where some calibre generated KF8 files would cause the Amazon KF8 viewer on the Touch to go into an infinite loop when using the next page function"
tickets: [1026421]
- title: "News download: Add support for <img> tags that link to SVG images."
tickets: [1031553]
- title: "Update podofo to 0.9.1 in all binary builds, to fix corruption of some PDFs when updating metadata."
tickets: [1031086]
- title: "Catalog generation: Handle authors whose last name is a number."
- title: "KF8 Input: Handle html entities in the NCX toc entries correctly"
- title: "Fix a calibre crash that affected some windows installs"
tickets: [1030234]
- title: "MOBI Output: Normalize unicode strings before writing to file, to workaround lack of support for non-normal unicode in Amazon's MOBI renderer."
tickets: [1029825]
- title: "EPUB Input: Handle files that have duplicate entries in the spine"
- title: "Fix regression in Kobo driver that caused the on device column to not be updated after deleting books"
new recipes:
- title: Dziennik Polski
author: Gregorz Maj
- title: High Country Blogs
author: Armin Geller
- title: Philosophy Now
author: Rick Shang
- version: 0.8.62
date: 2012-07-27
new features:
- title: "Book details panel: Allow right clicking on a format to delete it."
- title: "When errors occur in lots of background jobs, add an option to the error message to temporarily suppress subsequent error messages."
tickets: [886904]
- title: "E-book viewer full screen mode: Allow clicking in the left and right page margins to turn pages."
tickets: [1024819]
- title: "Drivers for various Android devices"
tickets: [1028690,1027431]
- title: "Advanced search dialog: When starting on the title/author/etc. tab, restore the previously used search kind as well."
tickets: [1029745]
- title: "When presenting the calibre must be restarted warning after installing a new plugin, add a restart now button so that the user can conveniently restart calibre. Currently only works when going via Preferences->Plugins->Get new plugins"
bug fixes:
- title: "Fix main window layout state being saved incorrectly if calibre is killed without a proper shutdown"
- title: "Fix boolean and date searching in non english calibre installs."
- title: "Conversion: Ignore invalid chapter detection and level n ToC expressions instead of erroring out"
improved recipes:
- Psychology Today
- The Smithsonian
- The New Republic
- Various updated Polish news sources
- The Sun
- San Francisco Bay Guardian
- AnandTech
- Smashing Magazine
new recipes:
- title: Linux Journal and Conowego.pl
author: fenuks
- title: A list apart and .net magazine
author: Marc Busque
- version: 0.8.61
date: 2012-07-20
new features:
- title: "E-book viewer: Add a paged mode that splits up the text into pages, like in a paper book instead of presenting it as a single column. To activate click the button with the yellow scroll icon in the top right corner."
type: major
description: "In paged mode, the ebook viewer no longer cuts off the last line of text at the bottom of the screen, and it respects CSS page-break directives. You can also set page margins and control the number of pages displayed on screen by clicking the Preferences button in the viewer and going to 'Text layout in paged mode'."
- title: "Digitally sign the calibre OS X and windows builds"
- title: "Get Books: Add Mills and Boon UK"
- title: "Various minor improvements to the Bulk metadata edit dialog"
tickets: [1025825, 1025838, 1025628]
- title: "Fix various regressions in the auto-complete functionality for authors/series/tags etc introduced in 0.8.60"
- title: "Drivers for various new Android devices"
tickets: [1024934]
- title: "MOBI: Add support for the new language EXTH header field in MOBI files generated by kindlegen 2.5"
bug fixes:
- title: "KF8 Output: Fix calibre produced KF8 files not showing the 'Use publisher font' option on the Kindle Touch when they have embedded fonts"
- title: "Txt/fb2/rtf/pml/rb output: Fix non-visible element's tail text (which should be visible) being ignored when it shouldn't be."
tickets: [1026541]
- title: "Book details panel: When displaying a link to amazon, use a country specific name like amazon.fr instead of using amazon.com for all countries"
- title: "Conversion: When splitting on page breaks, ignore page-breaks with values of auto and inherit. "
tickets: [1018875]
- title: "Metadata jacket: Specify foreground in addition to the background color for the title banner so that it remains readable if the user tries to monkey with the CSS in the viewer."
- title: "PDF Output: Fix rendering of cover as first page of PDF (ignore margins so that the image covers the entire page)"
- title: "Linux binaries: Bundle libglib to avoid incompatibilities with glib on various distros."
tickets: [1022019]
- title: "Fix find_identical_books() choking on books with too many authors"
improved recipes:
- Toronto Star
- American Prospect
- faz.net
- version: 0.8.60
date: 2012-07-13
new features:
- title: "When searching, allow use of un-accented characters to match accented characters in all fields and all languages (not just authors and English as before)"
description: "The rules for matching un-accented characters are done in a language dependent way. So if your calibre interface language is set to English, n will match both n and ñ, but if it is set to Spanish, it will match only n, as ñ is a separate letter in Spanish. This makes searching a little slower, so if you have a very large library you can turn it off via Preferences->Searching."
type: major
- title: "Content server: Show a best guess for the IP address the content server is currently listening at in the connect/share menu."
tickets: [1024128]
- title: "E-book viewer: Add an option to show a clock in full screen mode."
tickets: [1022086]
- title: "Drivers for Paquito Imaginarium and a few Android phones"
tickets: [1024021,1023613,1023461,1022401]
- title: "HTMLZ Output: Add option to use the book title as the filename for the html file inside the archive"
- title: "Make the list of displayed fields in the book details panel a per library setting"
- title: "Have autocomplete on authors/series/tags/etc. ignore accented characters when finding matches (similar to the changes to search above)"
- title: "Support for retina displays in OS X (I hope)"
tickets: [1022191]
- title: "Remove the dependency on the zip command line tool when developing plugins"
bug fixes:
- title: "Kobo driver: Do not perform write operations on the Kobo database if its version is newer than the latest version the driver supports, for safety"
- title: "KF8 Input: Ignore encoding declarations inside the html markup, as they are sometimes incorrect."
tickets: [1022933]
- title: "Force refresh of cached composite column values when values in the cache are changed"
- title: "Fix a regression that broke calibre --shutdown-running-calibre on windows."
tickets: [1022504]
- title: "Possible workaround for Qt 4.8.2 open file dialog failing on some linux distros."
tickets: [1022019]
- title: "Catalogs: Fix some epubcheck errors when generating catalogs in EPUB format"
- title: "Linux installer: When calling the xdg utilities use system libraries rather than the libraries bundled with calibre"
- title: "Fix numeric sort for composite custom columns that use custom separators"
tickets: [1021814]
- title: "Tag browser: When grouping by first letter, handle languages that have 'letters' made of more than one character. This can be turned off via Preferences->Tweaks"
improved recipes:
- Hola magazine
- Adventure Gamers
- Cosmopolitan UK
- Onda Rock
new recipes:
- title: Empire Magazine
author: Dave Asbury
- title: NZZ Folio
author: Bernd Leinfelder
- title: Warentest
author: asdfdsfksd
- version: 0.8.59
date: 2012-07-06
new features:
- title: "Drivers for Samsung SGH-T989 and Sony Ericsson Sola"
tickets: [1021365]
- title: "Conversion pipeline: When removing the first image, also remove the html file the image is found in, if that file has no other content. Allows this option to be used to remove covers from EPUB files without leaving behind a blank page."
- title: "Content server: Add a navigation panel at the bottom of each page."
tickets: [1020225]
- title: "calibredb: Add a backup_metadata command to manually run the backup to opf from the command line"
- title: "User defined driver: Add option to swap main memory and card a."
tickets: [1020056]
- title: "Add new option to the series_index_auto_increment tweak, no_change, that causes calibre not to change the series_index when the series is changed"
bug fixes:
- title: "PDF Output: Resize large images so that they do not get cut off at the right edge of the page."
- title: "On linux ensure that WM_CLASS for the main calibre GUI is set to 'calibre-gui' to match the name of the calibre-gui.desktop file. This is apparently required by the GNOME 3 shell."
tickets: [1020297]
- title: "Update ICU in all builds to version 49.1"
- title: "Tag browser: Fix regression that broke drag and drop between user categories in the tag browser"
- title: "When copying to library and deleting after copy, do not place deleted files in recycle bin, as this is redundant and slow (they have already been copied into another library)"
- title: "Fix yes/no fields with value of No not showing up in the book details panel"
- title: "Catalogs: Better sorting for non English languages"
tickets: [930882]
- title: "Get Books: Fix Foyles UK, Weightless books, ebooks.com and ozon.ru"
- title: "CHM Input: Fix handling of chm files that split their html into multiple sub-directories."
tickets: [1018792]
improved recipes:
- FHM UK
- The Age
- weblogs_ssl
- Heraldo.es
new recipes:
- title: CATO Institute and Heritage Foundation
author: _reader
- version: 0.8.58
date: 2012-06-29
new features:
- title: "Add some texture to calibre generated covers"
- title: "Drivers for Sogo SS-4370, HTC G2 and Lenovo ThinkPad Tablet"
tickets: [1019050, 1017010]
- title: "Add search to the Manage tags/series/etc. dialogs"
- title: "News download: Add support for images embedded in the HTML"
- title: "calibre -s now waits for calibre to shutdown"
bug fixes:
- title: "Workaround for iTunes breaking scripting with version 10.6.3 on OS X."
tickets: [1012243]
- title: "EPUB Input: When there are multiple elements of the same type in the OPF guide, use the first rather than the last element."
- title: "Windows: Disable the new UI style if the color depth of the desktop is less than 32 bits per pixel"
- title: "ISBNDB metadata plugin: Return results even though they have no comments"
- title: "More robust handling of EINTR during IPC"
- title: "Metadata download: Support for amazon's new results page markup"
- title: "EPUB Output: Fix a bug that could cause corrupted output when doing an EPUB/OEB to EPUB conversion if the input EPUB had multiple files with the same name"
- title: "KF8 Output: Fix a couple of bugs that could lead to generation of invalid KF8 files."
tickets: [1016672]
improved recipes:
- ABC Digital
- O Globo
new recipes:
- title: Sign of the Times and New Statesman
author: TerminalVeracity
- title: CT24
author: zoidozoido
- title: SmileZilla
author: Will
- title: Marketing Sensoriale
author: NotTaken
- version: 0.8.57
date: 2012-06-22
new features:
- title: "PDF Output: Full pagination support. No more cutoff bottom line."
type: major
description: "Fixes a long standing bug in calibre's PDF Output that caused the bottom line of some pages to be partially cut off and prevented top and bottom margins from working."
- title: "calibredb add now prints out the ids of added books"
tickets: [1014303]
- title: "Kobo Vox driver: Add support for new Google Play firmware"
tickets: [1014129]
- title: "Driver for Prestigio PMP5097PRO"
tickets: [1013864]
- title: "Add option to disable tooltips in the book list under Preferences->Look & Feel"
- title: "When customizing builtin recipes download the latest version of the recipe to customize instead of using the possibly out of date bundled version"
bug fixes:
- title: "PDF Output: Use the cover from the input document when no cover is specified during a conversion"
- title: "E-book Viewer: Printing now has proper pagination with top and bottom margins, no lines partially cut off at the bottom, and full style retention"
- title: "KF8 Input: Handle files with incorrectly encoded guide type entries."
tickets: [1015020]
- title: "E-book viewer: Disable hyphenation on windows xp as Qt WebKit barfs on soft hyphens on windows XP"
- title: "Handle OS X systems with invalid palette colors."
tickets: [1014900]
- title: "Tag Browser: Fix regression that broke partitioning of hierarchical categories."
tickets: [1014065]
- title: "LRF Output: Handle negative page margins"
tickets: [1014103]
- title: "Template language: Fix arithmetic functions to tolerate the value 'None' as returned by raw_field()"
- title: "Fix custom title sort set in the edit metadata dialog getting reset by the conversion dialog"
improved recipes:
- The Economist
- Akter
- 24 Sata sr
- Novi List
- Metro Montreal
- Mode Durable
- CanardPC
- The Economic Collapse
- Our Daily Bread
new recipes:
- title: Akter Daily
author: Darko Miletic
- title: BBC Brasil
author: Claviola
- title: Homopedia.pl
author: rainbowwarrior
- title: National Geographic Magazine
author: Terminal Veracity
- title: Something Awful
author: atordo
- title: Huffington Post UK
author: Krittika Goyal
- version: 0.8.56
date: 2012-06-15
new features:
- title: "Make the new calibre style default on Windows and OS X."
type: major
description: "This change gives a more 'modern' feel to the calibre user interface with focus highlighting, gradients, rounded corners, etc. In case you prefer the old look, you can restore under Preferences->Look & Feel->User interface style"
- title: "Get Books: Add the new SONY Reader store"
- title: "Read metadata from .docx (Microsoft Word) files"
- title: "Allow customizing the behavior of the searching for similar books by right clicking the book. You can now tell calibre to search different columns than the traditional author/series/publisher/tags/etc. in Preferences->Searching"
- title: "Add option to restore alternating row colors to the Tag Browser under Preferences->Look & Feel->Tag Browser"
- title: "Update to Qt 4.8.2 on windows compiled with link time code generation for a small performance boost"
bug fixes:
- title: "Get Books: Update plugins to handle website changes at ebooks.com, project gutenberg, and virtualo"
- title: "AZW3 Output: Fix TOC at start option not working"
- title: "AZW3 Output: Close self closing script/style/title/head tags explicitly as they cause problems in webkit based renderers like the Kindle Fire and calibre's viewers."
- title: "Fix the current_library_name() template function not updating after a library switch"
- title: "AZW3 Output: Handle the case of a link pointing to the last line of text in the document."
tickets: [1011330]
- title: "Fix regression in 0.8.55 that broke highlighting of items matching a search in the Tag Browser"
tickets: [1011030]
- title: "News download: Handle query only relative URLs"
improved recipes:
- Christian Science Monitor
- Neue Zurcher Zeitung
- Birmingham Post
- Metro UK
- New Musical Express
- The Independent
- The Daily Mirror
- Vreme
- Smithsonian Magazine
new recipes:
- title: NZZ Webpaper
author: Bernd Leinfelder
- version: 0.8.55
date: 2012-06-08
new features:
- title: "Add a new 'Calibre style' interface look that is more modern than the default look. You can select it via Preferences->Look & Feel->User interface style."
- title: "New, subtler look for the Tag Browser"
- title: "Driver for Trekstor Pyrus and Pantech Android Tablet"
tickets: [1008946, 1007929]
- title: "Conversion pipeline: Handle guide elements with incorrectly cased hrefs. Also handle guide elements of type coverimagestandard and thumbimagestandard."
- title: "Allow user to customize trekstor plugin to send books into sub directories."
tickets: [1007646]
- title: "EPUB Input: Add support for EPUB files that use the IDPF font obfuscation algorithm. Apparently, people have started producing these now."
tickets: [1008810]
- title: "Save single format to disk: Only show the format available in the selected books."
tickets: [1007287]
bug fixes:
- title: "MOBI Output: When using the insert metadata at start of book option, do not use a table to layout the metadata, as the Kindle Fire crashes when rendering the table."
tickets: [1002119]
- title: "Device detection: Fix a bug that could cause device detection to fail completely if devices with certain vendor/product ids are connected."
tickets: [1009718]
- title: "MOBI Output: When rasterizing svgs only compute style information when an actual svg image is present. Small speedup when converting large svg-free documents to MOBI."
- title: "SONY T1 driver: Fix support for collections of books placed on the SD card"
tickets: [986044]
- title: "Fix partitioning problems in tag browser with fields that have no name, such as identifiers and formats"
- title: "Welcome wizard: Preferentially use the kindle email address set as default when more than one such address exists."
tickets: [1007932]
- title: "Fix regression in 0.8.54 that broke the use of the shortcut Alt+A to select books by the same author"
improved recipes:
- Various Polish recipes
- Vice Magazine
- EL Mundo Today
- Haaretz
- Good Housekeeping
- El Pais
- Christian Science Monitor
- Marketing Magazine
- Instapaper
new recipes:
- title: Various Philippine news sources
author: jde
- title: Natemat.pl and wirtualnemedia.pl
author: fenuks
- title: Rabble.ca
author: timtoo
- version: 0.8.54
date: 2012-05-31
new features:
- title: "E-book viewer: The Table of contents panel now tracks the current position in the book. As you scroll through the book, the entry you are currently on is highlighted."
type: major
description: "To see this feature in action, open the Table of Contents panel in the viewer by clicking the button with three blue lines on it. As you page through the book, the chapter you are reading currently is highlighted in the Table of Contents Panel. Obviously, this will only work if the book you are reading has a Table of Contents. You can also use the Ctrl+PgUp and Ctrl+PgDn keys to quickly skip between chapters."
- title: "calibredb: Allow setting metadata for individual fields with the set_metadata command"
- title: "Make it a little harder to accidentally change the sorting of items in the Tag Browser. Also frees up more vertical space for the Tag Browser itself."
- title: "The calibre user manual is now available in AZW3 format as well as EPUB"
bug fixes:
- title: "Automatic titlecasing: No longer try to capitalize scottish names, as there are too many special cases."
tickets: [775825]
- title: "Never crash when reading metadata from PDF files (reading now always happens in a worker process)"
tickets: [1006452]
- title: "EPUB Input: Do not skip the valid children of an NCX node that has no text/href"
- title: "Archos driver: Detect SD card"
tickets: [1005650]
- title: "When bulk downloading metadata and the user deletes one of the books for which metadata is being downloaded, just ignore it, instead of erroring out"
- title: "When deleting books from the bottom of the booklist, ensure that the bottom book after deleting is selected"
- title: "Fix regression in 0.8.53 that broke sending APNX files to older Kindle devices"
- title: "Use correct text color for selected rows in the list of matches when downloading metadata and showing results in get books."
tickets: [1004568]
improved recipes:
- The Independent
- Welt der Physik
- China Daily
- The Grid
- Prospect Magazine
new recipes:
- title: La Gazzetta del Mezzogiorno
author: faber1971
- version: 0.8.53
date: 2012-05-25

View File

@ -20,6 +20,53 @@
# new recipes:
# - title:
- version: 0.9.34
date: 2013-06-07
new features:
- title: "Conversion of Microsoft Word documents (.docx files generated by Word 2007 or newer)"
type: major
description: "DOCX files created with Microsoft Word 2007 or newer can now be converted by calibre. The converter has support for lists, tables, images, all types of text formatting, footnotes, endnotes and even dropcaps. A sample docx file showing the capabilities of the converter is available: http://calibre-ebook.com/downloads/demos/demo.docx Note that this code is still very new, so there are more than likely a few bugs waiting to be squashed."
- title: "Kobo driver: Support for the newly released firmware 2.6.1. Also remove empty shelves from the Aura HD home page when deleting books."
tickets: [1187791]
- title: "E-book viewer: Add Keyboard shortcuts for Back and Forward (Alt+Left, Alt+Right)"
tickets: [1186928]
- title: "Allow right clicking on an author in Book Details to manage that author, i.e. change the author name, sort value or link."
tickets: [1186192]
bug fixes:
- title: "Fix regression that broke FB2 input in the previous release."
tickets: [1186213]
- title: "Catalog generation on OS X: Fix handling of some unicode characters"
tickets: [1066922]
- title: "HTML Input: Avoid spurious log warnings about unspecified language/creator when these are actually specified on the command line."
tickets: [1186899]
- title: "MOBI Output: Fix regression in 0.9.31 that caused vertical margins specified on some block level elements to be ignored."
tickets: [1186533]
- title: "ToC Editor: Handle ebooks that have <p> tags inside the <head> tags. Instead of erroring out, the <p> tags are automatically moved into <body>."
tickets: [1186298]
- title: "Linux build: Include the ffi libs from both gcc and libffi."
tickets: [1186148]
- title: "When deleting custom recipes, use recycle bin."
tickets: [1186142]
improved recipes:
- Folha de Sao Paolo
- Metro News NL
new recipes:
- title: Seventh Guard
author: koliberek
- version: 0.9.33
date: 2013-05-31
@ -1878,821 +1925,3 @@
author: drMerry
- version: 0.8.69
date: 2012-09-14
new features:
- title: "E-book viewer: Add a button to the toolbar to switch themes easily"
tickets: [1047992]
- title: "When downloading metadata for many books, if some of them fail, add an option to the downloaded message to show the failed books in the main book list, so that they can be individually processed easily"
- title: "Remember last used window size of the conversion dialogs."
tickets: [1049265]
- title: "Kindle driver: Turn on sending of azw3 files to kindles by default, since the KK now has azw3 support"
- title: "Conversion: Add support for CSS pseudo classes :hover, :link, :visited, :first-line, :focus, :active, :first-letter"
- title: "Wireless device driver: Make the default save template not use folders"
bug fixes:
- title: "Fix a regression in the previous release that broke sending of books to the second SD card in SONY readers"
tickets: [1047992]
- title: "Fix a memory leak when scanning for devices in windows"
- title: "Ebook-viewer: When displaying mathematics, reflow equations that don't fit on a single line"
- title: "Catalogs: Do not mark the AZW3 catalog as a periodical, as most Kindle devices cannot handle AZW3 periodicals"
- title: "Content server: When using a custom IP address to listen on via Preferences->Tweaks advertise that IP address via BonJour."
- title: "Fix ebook catalog generation on linux systems where the encoding is not UTF-8."
tickets: [1048404]
improved recipes:
- De Volkskrant
- Metro UK
- Countryfile
- Die Zeit (subscription)
- Birmingham post
new recipes:
- title: History Today
author: Rick Shang
- version: 0.8.68
date: 2012-09-07
new features:
- title: "Drivers for the Nokia N9, Viewsonic 7e, Prestigio PER3274B and Coby Kyros 7035 "
tickets: [1046794,1046544]
- title: "Add a tutorial on creating catalogs to the User Manual and a link to it in the create catalogs dialog"
- title: "Wireless device connections: Add an option to force calibre to listen on a particular IP address. Access it by customizing the plugin in Preferences->Plugins"
- title: "Android driver: Add an extra customization option to configure the directory to which ebooks are sent on the storage cards."
tickets: [1045045]
- title: "Add an option under Preferences->Look & Feel->Book Details to hide the cover in the book details panel"
- title: "The Calibre Companion Android app that allows wireless connection of Android device to calibre is out of beta. See https://play.google.com/store/apps/details?id=com.multipie.calibreandroid"
bug fixes:
- title: "Fix sorting by author not working in the device view in calibre when connected to iTunes"
tickets: [1044619]
- title: "Fix using the 'configure this device' menu action not validating settings"
- title: "Device drivers: Ignore corrupted entries in metadata.calibre, instead of raising an error"
- title: "PDF Output: Do not error out when generating an outline which points to pages that have been removed."
tickets: [1044799]
- title: "PDF Output: Fix incorrect page numbers being generated in the outline when converting some books"
- title: "PDF Output: Reduce memory consumption when writing out the PDF file, by using a stream"
- title: "EPUB metadata: When there are multiple <dc:date> tags use the one with the earliest date as the published date"
improved recipes:
- Wall Street journal (subscription version)
- Houston Chronicle
- Various Romanian news sources
- Business Week Magazine
- Arcamax
- version: 0.8.67
date: 2012-08-31
new features:
- title: "PDF Output: Generate a PDF Outline based on the Table of Contents of the input document"
- title: "Conversion: Add an option under Structure Detection to set the 'Start reading at' metadata with an XPath expression."
tickets: [1043233]
- title: "Speed up changing the title and author of files with books larger than 3MB by avoiding an unnecessary extra copy."
- title: "Wireless device driver: Make detecting and connecting to devices easier on networks where mdns is disabled"
- title: "PDF Output: Allow choosing the default font family and size when generating PDF files (under PDF Options) in the conversion dialog"
- title: "Metadata dialog: Comments editor: Allow specifying the name of a link when using the insert link button."
tickets: [1042683]
- title: "Remove the unmaintained pdfmanipulate command line utility. There are many other tools that provide similar functionality, for example, pdftk and podofo"
bug fixes:
- title: "Catalogs: Fix regression that broke sorting of non series titles before series titles"
- title: "PDF Output: Do not create duplicate embedded fonts in the PDF for every individual HTML file in the input document"
- title: "Fix regression that broke DnD of files having a # character in their names to the book details panel"
- title: "PDF Output: Allow generating PDF files with more than 512 pages on windows."
tickets: [1041614]
- title: "Fix minor bug in handling of the completion popups when using the next/previous buttons in the edit metadata dialog"
tickets: [1041389]
improved recipes:
- Coding Horror
- TIME Magazine
new recipes:
- title: Cumhuriyet Yzarlar
author: Sethi Eksi
- title: Arcadia
author: Masahiro Hasegawa
- title: Business Week Magazine and Chronicle of Higher Education
author: Rick Shang
- title: CIPER Chile
author: Darko Miletic
- version: 0.8.66
date: 2012-08-24
new features:
- title: "E-book viewer: Support the display of mathematics in e-books. Supports both embedded TeX and MathML"
description: "The calibre ebook viewer can now display embedded mathematics (symbols, equations, fractions, matrices, etc.) in EPUB and HTML ebooks. For details, see: http://manual.calibre-ebook.com/typesetting_math.html"
type: major
- title: "Drivers for SONY PRS-T2, Freelander PD10 and Coolreader Tablet"
tickets: [1039103]
- title: "Wireless device connections: Use a streamed mode for improved networking performance leading to much less time spent sending metadata to/from the device. Also make it easier to specify a fixed port directly in the dialog used to start the connection."
- title: "Get books: Add ebooksgratuitis.com"
bug fixes:
- title: "PDF Output: Handle input epub documents with filenames starting with a dot. Also do not hang if there is an unhandled error."
tickets: [1040603]
- title: "Get Books: Update B&N plugin to handle changes to the B&N website"
- title: "Content server: Fix regression that caused the port being advertised via BonJour to be incorrect if the user changed the port for the server."
tickets: [1037912]
improved recipes:
- Variety
- The Times UK
new recipes:
- title: Le Monde subscription version
author: Remi Vanicat
- title: Brecha Digital
author: Darko Miletic
- version: 0.8.65
date: 2012-08-17
new features:
- title: "A new wireless device driver. This allows connecting wirelessly to a device running a 'smart' calibre client"
description: "The wireless connection functions just as if the device was plugged into the computer by USB cable. Currently, Android devices are supported. See https://play.google.com/store/apps/details?id=com.multipie.calibreandroid"
type: major
- title: "MOBI Output: Add an option to control the type of MOBI file produced, to the MOBI Output conversion options. You can now generate an old MOBI6, a new KF8 or a joint MOBI6/KF8 file. By default, MOBI6 files are generated. This replaces the previous use of a tweak."
- title: "E-book viewer: Make paged mode the default. You can go back to the old flow mode by clicking the button with the yellow scroll in the top right corner of the viewer."
- title: "Driver for COBY kYROS MID7042 and Samsung Galaxy Ace S5839i"
bug fixes:
- title: "Update version of poppler bundled with calibre to fix reading covers from some PDF files"
- title: "Get Books: Fix clicking of results from Diesel books when there is only a single result not working"
- title: "Improve detection of system language on first run of calibre"
tickets: [1036354]
- title: "When finding the next series index and the last series index is a fractional number, use the next largest integer, instead of just adding 1"
- title: "Fix exception when saving a search/replace when no saved search/replace had been opened previously in the bulk search/replace dialog"
tickets: [1036464]
- title: "Fix restore database not restoring entries for the original_* formats"
- title: "Fix first run wizard not allowing empty email sending settings"
tickets: [1036358]
- title: "Do not error out when setting the cover for a book that has no folders in the library."
tickets: [1035935]
- title: "Conversion pipeline: Ignore unparseable values in the color attribute of font tags, instead of erroring out on them."
tickets: [1035633]
- title: "Catalogs: Fix regression that broke creation of catalogs while a device is connected"
- title: "Fix --with-library=/whatever not working for calibredb list"
improved recipes:
- Slashdot
- Various Canadian newspapers
- Business Spectator
- version: 0.8.64
date: 2012-08-09
new features:
- title: "E-book viewer: Allow viewing images in the book in a separate pop-up window by right clicking on the image. Useful if you want to keep some image, like a map to the side while reading the book."
- title: "Catalogs: Allow generation of catalogs in AZW3 format. Also add more powerful configuration options to exclude books and set prefixes. See http://www.mobileread.com/forums/showthread.php?t=187298 for details."
- title: "Generate a PDF version of the User Manual"
bug fixes:
- title: "News download: Fix broken handling of nesting for HTML 5 tags when parsing with BeautifulSoup"
- title: "EPUB: Handle files in the EPUB that have semi-colons in their file names. This means in particular using URL escaping when creating the NCX as ADE cannot handle unescaped semi-colons in the NCX."
tickets: [1033665]
- title: "Conversion pipeline: Ignore unparseable CSS instead of erroring out on it."
tickets: [1034074]
- title: "When setting up a column coloring rule based on the languages column, allow entry of localized language names instead of only ISO codes"
- title: "Catalogs: Generate cover for mobi/azw3 catalogs"
- title: "Update the last modified column record of a book, whenever a format is added to the book."
- title: "E-book viewer: Fix line scrolling stops at breaks option not working in paged mode"
tickets: [1033430]
- title: "MOBI Output: Fix ToC at start option having no effect when converting some input documents that have an out-of-spine ToC."
tickets: [1033656]
- title: "Catalog Generation: When generating EPUB/MOBI catalogs add more flexible rules for excluding books. Also add rules to customize the prefix characters used."
- title: "Make setting published date using metadata search/replace more robust."
- title: "Tag Browser: Flatten the display of sub-groups when sort by is not set to 'name'."
tickets: [1032746]
- title: "Fix isbn:false not matching if other identifiers are attached to the book."
improved recipes:
- The New Republic
- ZDNet
- Metro UK
- FHM UK
new recipes:
- title: eKundelek.pl
author: Artur Stachecki
- title: Sueddeutsche Mobil
author: Andreas Zeiser
- version: 0.8.63
date: 2012-08-02
new features:
- title: "E-book viewer: Allow quick saving and loading of viewer settings as 'themes'."
tickets: [1024611]
- title: "Ebook-viewer: Add a restore defaults button to the viewer preferences dialog"
- title: "E-book viewer: Add simple settings for text and background colors"
- title: "Add an entry to save to disk when right clicking a format in the book details panel"
- title: "ODT metadata: Read first image as the metadata cover from ODT files. Also allow ODT authors to set custom properties for extended metadata."
- title: "E-book viewer and PDF Output: Resize images that are longer than the page to fit onto a single page"
bug fixes:
- title: "KF8 Output: Fix bug where some calibre generated KF8 files would cause the Amazon KF8 viewer on the Touch to go into an infinite loop when using the next page function"
tickets: [1026421]
- title: "News download: Add support for <img> tags that link to SVG images."
tickets: [1031553]
- title: "Update podofo to 0.9.1 in all binary builds, to fix corruption of some PDFs when updating metadata."
tickets: [1031086]
- title: "Catalog generation: Handle authors whose last name is a number."
- title: "KF8 Input: Handle html entities in the NCX toc entries correctly"
- title: "Fix a calibre crash that affected some windows installs"
tickets: [1030234]
- title: "MOBI Output: Normalize unicode strings before writing to file, to workaround lack of support for non-normal unicode in Amazon's MOBI renderer."
tickets: [1029825]
- title: "EPUB Input: Handle files that have duplicate entries in the spine"
- title: "Fix regression in Kobo driver that caused the on device column to not be updated after deleting books"
new recipes:
- title: Dziennik Polski
author: Gregorz Maj
- title: High Country Blogs
author: Armin Geller
- title: Philosophy Now
author: Rick Shang
- version: 0.8.62
date: 2012-07-27
new features:
- title: "Book details panel: Allow right clicking on a format to delete it."
- title: "When errors occur in lots of background jobs, add an option to the error message to temporarily suppress subsequent error messages."
tickets: [886904]
- title: "E-book viewer full screen mode: Allow clicking in the left and right page margins to turn pages."
tickets: [1024819]
- title: "Drivers for various Android devices"
tickets: [1028690,1027431]
- title: "Advanced search dialog: When starting on the title/author/etc. tab, restore the previously used search kind as well."
tickets: [1029745]
- title: "When presenting the calibre must be restarted warning after installing a new plugin, add a restart now button so that the user can conveniently restart calibre. Currently only works when going via Preferences->Plugins->Get new plugins"
bug fixes:
- title: "Fix main window layout state being saved incorrectly if calibre is killed without a proper shutdown"
- title: "Fix boolean and date searching in non english calibre installs."
- title: "Conversion: Ignore invalid chapter detection and level n ToC expressions instead of erroring out"
improved recipes:
- Psychology Today
- The Smithsonian
- The New Republic
- Various updated Polish news sources
- The Sun
- San Francisco Bay Guardian
- AnandTech
- Smashing Magazine
new recipes:
- title: Linux Journal and Conowego.pl
author: fenuks
- title: A list apart and .net magazine
author: Marc Busque
- version: 0.8.61
date: 2012-07-20
new features:
- title: "E-book viewer: Add a paged mode that splits up the text into pages, like in a paper book instead of presenting it as a single column. To activate click the button with the yellow scroll icon in the top right corner."
type: major
description: "In paged mode, the ebook viewer no longer cuts off the last line of text at the bottom of the screen, and it respects CSS page-break directives. You can also set page margins and control the number of pages displayed on screen by clicking the Preferences button in the viewer and going to 'Text layout in paged mode'."
- title: "Digitally sign the calibre OS X and windows builds"
- title: "Get Books: Add Mills and Boon UK"
- title: "Various minor improvements to the Bulk metadata edit dialog"
tickets: [1025825, 1025838, 1025628]
- title: "Fix various regressions in the auto-complete functionality for authors/series/tags etc introduced in 0.8.60"
- title: "Drivers for various new Android devices"
tickets: [1024934]
- title: "MOBI: Add support for the new language EXTH header field in MOBI files generated by kindlegen 2.5"
bug fixes:
- title: "KF8 Output: Fix calibre produced KF8 files not showing the 'Use publisher font' option on the Kindle Touch when they have embedded fonts"
- title: "Txt/fb2/rtf/pml/rb output: Fix a non-visible element's tail text (which should be visible) being ignored when it shouldn't be."
tickets: [1026541]
- title: "Book details panel: When displaying a link to amazon, use a country specific name like amazon.fr instead of using amazon.com for all countries"
- title: "Conversion: When splitting on page breaks, ignore page-breaks with values of auto and inherit. "
tickets: [1018875]
- title: "Metadata jacket: Specify foreground in addition to the background color for the title banner so that it remain readable if the user tries to monkey with the CSS in the viewer."
- title: "PDF Output: Fix rendering of cover as first page of the PDF (ignore margins so that the image covers the entire page)"
- title: "Linux binaries: Bundle libglib to avoid incompatibilities with glib on various distros."
tickets: [1022019]
- title: "Fix find_identical_books() choking on books with too many authors"
improved recipes:
- Toronto Star
- American Prospect
- faz.net
- version: 0.8.60
date: 2012-07-13
new features:
- title: "When searching, allow use of un-accented characters to match accented characters in all fields and all languages (not just authors and English as before)"
description: "The rules for matching un-accented characters are done in a language dependent way. So if your calibre interface language is set to English, n will match both n and ñ, but if it is set to Spanish, it will match only n, as ñ is a separate letter in Spanish. This makes searching a little slower, so if you have a very large library you can turn it off via Preferences->Searching."
type: major
- title: "Content server: Show a best guess for the IP address the content server is currently listening at in the connect/share menu."
tickets: [1024128]
- title: "E-book viewer: Add an option to show a clock in full screen mode."
tickets: [1022086]
- title: "Drivers for Paquito Imaginarium and a few Android phones"
tickets: [1024021,1023613,1023461,1022401]
- title: "HTMLZ Output: Add option to use the book title as the filename for the html file inside the archive"
- title: "Make the list of displayed fields in the book details panel a per library setting"
- title: "Have autocomplete on authors/series/tags/etc. ignore accented characters when finding matches (similar to the changes to search above)"
- title: "Support for retina displays in OS X (I hope)"
tickets: [1022191]
- title: "Remove the dependency on the zip command line tool when developing plugins"
bug fixes:
- title: "Kobo driver: Do not perform write operations on the Kobo database if its version is newer than the latest version the driver supports, for safety"
- title: "KF8 Input: Ignore encoding declarations inside the html markup, as they are sometimes incorrect."
tickets: [1022933]
- title: "Force refresh of cached composite column values when values in the cache are changed"
- title: "Fix a regression that broke calibre --shutdown-running-calibre on windows."
tickets: [1022504]
- title: "Possible workaround for Qt 4.8.2 open file dialog failing on some linux distros."
tickets: [1022019]
- title: "Catalogs: Fix some epubcheck errors when generating catalogs in EPUB format"
- title: "Linux installer: When calling the xdg utilities use system libraries rather than the libraries bundled with calibre"
- title: "Fix numeric sort for composite custom columns that use custom separators"
tickets: [1021814]
- title: "Tag browser: When grouping by first letter, handle languages that have 'letters' made of more than one character. This can be turned off via Preferences->Tweaks"
improved recipes:
- Hola magazine
- Adventure Gamers
- Cosmopolitan UK
- Onda Rock
new recipes:
- title: Empire Magazine
author: Dave Asbury
- title: NZZ Folio
author: Bernd Leinfelder
- title: Warentest
author: asdfdsfksd
- version: 0.8.59
date: 2012-07-06
new features:
- title: "Drivers for Samsung SGH-T989 and Sony Ericsson Sola"
tickets: [1021365]
- title: "Conversion pipeline: When removing the first image, also remove the html file the image is found in, if that file has no other content. Allows this option to be used to remove covers from EPUB files without leaving behind a blank page."
- title: "Content server: Add a navigation panel at the bottom of each page."
tickets: [1020225]
- title: "calibredb: Add a backup_metadata command to manually run the backup to opf from the command line"
- title: "User defined driver: Add option to swap main memory and card a."
tickets: [1020056]
- title: "Add new option to the series_index_auto_increment tweak, no_change, that causes calibre not to change the series_index when the series is changed"
bug fixes:
- title: "PDF Output: Resize large images so that they do not get cut off at the right edge of the page."
- title: "On linux ensure that WM_CLASS for the main calibre GUI is set to 'calibre-gui' to match the name of the calibre-gui.desktop file. This is apparently required by the GNOME 3 shell."
tickets: [1020297]
- title: "Update ICU in all builds to version 49.1"
- title: "Tag browser: Fix regression that broke drag and drop between user categories in the tag browser"
- title: "When copying to library and deleting after copy, do not place deleted files in recycle bin, as this is redundant and slow (they have already been copied into another library)"
- title: "Fix yes/no fields with value of No not showing up in the book details panel"
- title: "Catalogs: Better sorting for non English languages"
tickets: [930882]
- title: "Get Books: Fix Foyles UK, Weightless books, ebooks.com and ozon.ru"
- title: "CHM Input: Fix handling of chm files that split their html into multiple sub-directories."
tickets: [1018792]
improved recipes:
- FHM UK
- The Age
- weblogs_ssl
- Heraldo.es
new recipes:
- title: CATO Institute and Heritage Foundation
author: _reader
- version: 0.8.58
date: 2012-06-29
new features:
- title: "Add some texture to calibre generated covers"
- title: "Drivers for Sogo SS-4370, HTC G2 and Lenovo ThinkPad Tablet"
tickets: [1019050, 1017010]
- title: "Add search to the Manage tags/series/etc. dialogs"
- title: "News download: Add support for images embedded in the HTML"
- title: "calibre -s now waits for calibre to shutdown"
bug fixes:
- title: "Workaround for iTunes breaking scripting with version 10.6.3 on OS X."
tickets: [1012243]
- title: "EPUB Input: When there are multiple elements of the same type in the OPF guide, use the first rather than the last element."
- title: "Windows: Disable the new UI style if the color depth of the desktop is less than 32 bits per pixel"
- title: "ISBNDB metadata plugin: Return results even though they have no comments"
- title: "More robust handling of EINTR during IPC"
- title: "Metadata download: Support for amazon's new results page markup"
- title: "EPUB Output: Fix a bug that could cause corrupted output when doing an EPUB/OEB to EPUB conversion if the input EPUB had multiple files with the same name"
- title: "KF8 Output: Fix a couple of bugs that could lead to generation of invalid KF8 files."
tickets: [1016672]
improved recipes:
- ABC Digital
- O Globo
new recipes:
- title: Sign of the Times and New Statesman
author: TerminalVeracity
- title: CT24
author: zoidozoido
- title: SmileZilla
author: Will
- title: Marketing Sensoriale
author: NotTaken
- version: 0.8.57
date: 2012-06-22
new features:
- title: "PDF Output: Full pagination support. No more cutoff bottom line."
type: major
description: "Fixes a long standing bug in calibre's PDF Output that caused the bottom line of some pages to be partially cut off and prevented top and bottom margins from working."
- title: "calibredb add now prints out the ids of added books"
tickets: [1014303]
- title: "Kobo Vox driver: Add support for new Google Play firmware"
tickets: [1014129]
- title: "Driver for Prestigio PMP5097PRO"
tickets: [1013864]
- title: "Add option to disable tooltips in the book list under Preferences->Look & Feel"
- title: "When customizing builtin recipes download the latest version of the recipe to customize instead of using the possibly out of date bundled version"
bug fixes:
- title: "PDF Output: Use the cover from the input document when no cover is specified during a conversion"
- title: "E-book Viewer: Printing now has proper pagination with top and bottom margins, no lines partially cut off at the bottom, and full style retention"
- title: "KF8 Input: Handle files with incorrectly encoded guide type entries."
tickets: [1015020]
- title: "E-book viewer: Disable hyphenation on windows xp as Qt WebKit barfs on soft hyphens on windows XP"
- title: "Handle OS X systems with invalid palette colors."
tickets: [1014900]
- title: "Tag Browser: Fix regression that broke partitioning of hierarchical categories."
tickets: [1014065]
- title: "LRF Output: Handle negative page margins"
tickets: [1014103]
- title: "Template language: Fix arithmetic functions to tolerate the value 'None' as returned by raw_field()"
- title: "Fix custom title sort set in the edit metadata dialog getting reset by the conversion dialog"
improved recipes:
- The Economist
- Akter
- 24 Sata sr
- Novi List
- Metro Montreal
- Mode Durable
- CanardPC
- The Economic Collapse
- Our Daily Bread
new recipes:
- title: Akter Daily
author: Darko Miletic
- title: BBC Brasil
author: Claviola
- title: Homopedia.pl
author: rainbowwarrior
- title: National Geographic Magazine
author: Terminal Veracity
- title: Something Awful
author: atordo
- title: Huffington Post UK
author: Krittika Goyal
- version: 0.8.56
date: 2012-06-15
new features:
- title: "Make the new calibre style default on Windows and OS X."
type: major
description: "This change gives a more 'modern' feel to the calibre user interface with focus highlighting, gradients, rounded corners, etc. In case you prefer the old look, you can restore under Preferences->Look & Feel->User interface style"
- title: "Get Books: Add the new SONY Reader store"
- title: "Read metadata from .docx (Microsoft Word) files"
- title: "Allow customizing the behavior of the searching for similar books by right clicking the book. You can now tell calibre to search different columns than the traditional author/series/publisher/tags/etc. in Preferences->Searching"
- title: "Add option to restore alternating row colors to the Tag Browser under Preferences->Look & Feel->Tag Browser"
- title: "Update to Qt 4.8.2 on windows compiled with link time code generation for a small performance boost"
bug fixes:
- title: "Get Books: Update plugins to handle website changes at ebooks.com, project gutenberg, and virtualo"
- title: "AZW3 Output: Fix TOC at start option not working"
- title: "AZW3 Output: Close self closing script/style/title/head tags explicitly as they cause problems in webkit based renderers like the Kindle Fire and calibre's viewers."
- title: "Fix the current_library_name() template function not updating after a library switch"
- title: "AZW3 Output: Handle the case of a link pointing to the last line of text in the document."
tickets: [1011330]
- title: "Fix regression in 0.8.55 that broke highlighting of items matching a search in the Tag Browser"
tickets: [1011030]
- title: "News download: Handle query only relative URLs"
improved recipes:
- Christian Science Monitor
- Neue Zurcher Zeitung
- Birmingham Post
- Metro UK
- New Musical Express
- The Independent
- The Daily Mirror
- Vreme
- Smithsonian Magazine
new recipes:
- title: NZZ Webpaper
author: Bernd Leinfelder
- version: 0.8.55
date: 2012-06-08
new features:
- title: "Add a new 'Calibre style' interface look that is more modern than the default look. You can select it via Preferences->Look & Feel->User interface style."
- title: "New, subtler look for the Tag Browser"
- title: "Driver for Trekstor Pyrus and Pantech Android Tablet"
tickets: [1008946, 1007929]
- title: "Conversion pipeline: Handle guide elements with incorrectly cased hrefs. Also handle guide elements of type coverimagestandard and thumbimagestandard."
- title: "Allow user to customize trekstor plugin to send books into sub directories."
tickets: [1007646]
- title: "EPUB Input: Add support for EPUB files that use the IDPF font obfuscation algorithm. Apparently, people have started producing these now."
tickets: [1008810]
- title: "Save single format to disk: Only show the format available in the selected books."
tickets: [1007287]
bug fixes:
- title: "MOBI Output: When using the insert metadata at start of book option, do not use a table to layout the metadata, as the Kindle Fire crashes when rendering the table."
tickets: [1002119]
- title: "Device detection: Fix a bug that could cause device detection to fail completely if devices with certain vendor/product ids are connected."
tickets: [1009718]
- title: "MOBI Output: When rasterizing svgs only compute style information when an actual svg image is present. Small speedup when converting large svg-free documents to MOBI."
- title: "SONY T1 driver: Fix support for collections of books placed on the SD card"
tickets: [986044]
- title: "Fix partitioning problems in tag browser with fields that have no name, such as identifiers and formats"
- title: "Welcome wizard: Preferentially use the kindle email address set as default when more than one such address exists."
tickets: [1007932 ]
- title: "Fix regression in 0.8.54 that broke the use of the shortcut Alt+A to select books by the same author"
improved recipes:
- Various Polish recipes
- Vice Magazine
- EL Mundo Today
- Haaretz
- Good Housekeeping
- El Pais
- Christian Science Monitor
- Marketing Magazine
- Instapaper
new recipes:
- title: Various Philippine news sources
author: jde
- title: Natemat.pl and wirtualnemedia.pl
author: fenuks
- title: Rabble.ca
author: timtoo
- version: 0.8.54
date: 2012-05-31
new features:
- title: "E-book viewer: The Table of contents panel now tracks the current position in the book. As you scroll through the book, the entry you are currently on is highlighted."
type: major
description: "To see this feature in action, open the Table of Contents panel in the viewer by clicking the button with three blue lines on it. As you page through the book, the chapter you are reading currently is highlighted in the Table of Contents Panel. Obviously, this will only work if the book you are reading has a Table of Contents. You can also use the Ctrl+PgUp and Ctrl+PgDn keys to quickly skip between chapters."
- title: "calibredb: Allow setting metadata for individual fields with the set_metadata command"
- title: "Make it a little harder to accidentally change the sorting of items in the Tag Browser. Also frees up more vertical space for the Tag Browser itself."
- title: "The calibre user manual is now available in AZW3 format as well as EPUB"
bug fixes:
- title: "Automatic titlecasing: No longer try to capitalize scottish names, as there are too many special cases."
tickets: [775825]
- title: "Never crash when reading metadata from PDF files (reading now always happens in a worker process)"
tickets: [1006452]
- title: "EPUB Input: Do not skip the valid children of an NCX node that has no text/href"
- title: "Archos driver: Detect SD card"
tickets: [1005650]
- title: "When bulk downloading metadata and the user deletes one of the books for which metadata is being downloaded, just ignore it, instead of erroring out"
- title: "When deleting books from the bottom of the booklist, ensure that the bottom book after deleting is selected"
- title: "Fix regression in 0.8.53 that broke sending APNX files to older Kindle devices"
- title: "Use correct text color for selected rows in the list of matches when downloading metadata and showing results in get books."
tickets: [1004568]
improved recipes:
- The Independent
- Welt der Physik
- China Daily
- The Grid
- Prospect Magazine
new recipes:
- title: La gazetta del Mezzogiorno
author: faber1971

View File

@ -574,28 +574,33 @@ format, whether input or output are available in the conversion dialog under the
Convert Microsoft Word documents
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|app| does not directly convert .doc/.docx files from Microsoft Word. However, in Word, you can save the document
as HTML and then convert the resulting HTML file with |app|. When saving as HTML, be sure to use the
"Save as Web Page, Filtered" option as this will produce clean HTML that will convert well. Note that Word
produces really messy HTML, converting it can take a long time, so be patient. Another alternative is to
use the free OpenOffice. Open your .doc file in OpenOffice and save it in OpenOffice's format .odt. |app| can
directly convert .odt files.
|app| can automatically convert ``.docx`` files created by Microsoft Word 2007 and
newer. Just add the file to |app| and click convert (make sure you are running
the latest version of |app| as support for ``.docx`` files is very new).
There is a Word macro package that can automate the conversion of Word documents using |app|. It also makes
generating the Table of Contents much simpler. It is called BookCreator and is available for free
at `mobileread <http://www.mobileread.com/forums/showthread.php?t=28313>`_.
.. note::
There is a `demo .docx file <http://calibre-ebook.com/downloads/demos/demo.docx>`_
that demonstrates the capabilities of the |app| conversion engine. Just
download it and convert it to EPUB or AZW3 to see what |app| can do.
An easy way to generate a Table of Contents when converting a Word document is:
|app| will automatically generate a Table of Contents based on headings if you mark
your headings with the ``Heading 1``, ``Heading 2``, etc. styles in Word. Open
the output ebook in the calibre viewer and click the Table of Contents button
to view the generated Table of Contents.
1. Mark your Chapters and sub-Chapters in the doc file with one of the MS built-in styles called 'Heading 1', 'Heading 2', ..., 'Heading 6'. 'Heading 1' equates to the HTML tag <h1>, 'Heading 2' to <h2> etc
Older .doc files
^^^^^^^^^^^^^^^^^
2. Save the doc as Webpage-filtered (rather than Webpage) and import the html file into |app|
3. When you convert in |app| you use what you did in step 1 to set the box called 'Detect chapters at' on the Convert - Structure Detection page. For example:
* If you mark Chapters with style 'Heading 2' then set the 'Detect chapters at' box to //h:h2 This will give you a proper external metadata TOC in the converted epub.
* A slightly more complex example...if your book has Sections and Chapters and you want a 2-level nested metadata TOC. Mark the doc Sections with style 'Heading 2' and the Chapters with style 'Heading 3'. When you convert set the 'Detect chapters at' box to //h:h2|//h:h3. On the Convert - TOC page set the 'Level 1 TOC' box to //h:h2 and the 'Level 2 TOC' box to //h:h3.
For older .doc files, you can save the document as HTML with Microsoft Word
and then convert the resulting HTML file with |app|. When saving as
HTML, be sure to use the "Save as Web Page, Filtered" option as this will
produce clean HTML that will convert well. Note that Word produces really messy
HTML, converting it can take a long time, so be patient. If you have a newer
version of Word available, you can directly save it as docx as well.
Another alternative is to use the free OpenOffice. Open your .doc file in
OpenOffice and save it in OpenOffice's format .odt. |app| can directly convert
.odt files.
Convert TXT documents
~~~~~~~~~~~~~~~~~~~~~~

View File

@ -115,16 +115,27 @@ commits::
Be careful to not include merges when using ``HEAD~n``.
If you plan to do a lot of development on |app|, then the best method is to create a
`GitHub <http://github.com>`_ account. Once you have an account, follow the
steps at `Setup Git <https://help.github.com/articles/set-up-git>`_ and
`Fork A Repo <https://help.github.com/articles/fork-a-repo>`_ to create your own fork of the
`calibre GitHub repository <https://github.com/kovidgoyal/calibre>`_. Read
`Pushing to a remote <https://help.github.com/articles/pushing-to-a-remote>`_
to learn how to upload your commits to GitHub.
`GitHub <http://github.com>`_ account. Below is a basic guide to setting up
your own fork of calibre in a way that will allow you to submit pull requests
for inclusion into the main |app| repository:
* Set up git on your machine as described in this article: `Setup Git <https://help.github.com/articles/set-up-git>`_
* Set up SSH keys for authentication to GitHub, as described here: `Generating SSH keys <https://help.github.com/articles/generating-ssh-keys>`_
* Go to https://github.com/kovidgoyal/calibre and click the :guilabel:`Fork` button.
* In a Terminal do::
git clone git@github.com:<username>/calibre.git
Replace <username> above with your GitHub username. That will get your fork checked out locally.
* You can make changes and commit them whenever you like. When you are ready to have your work merged, do a::
git push
and go to ``https://github.com/<username>/calibre`` and click the :guilabel:`Pull Request` button to generate a pull request that can be merged.
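* You can update your local copy with code from the main repo at any time (this assumes you have added the main repository as a remote named ``upstream``, for example with ``git remote add upstream https://github.com/kovidgoyal/calibre.git``) by doing::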
git pull upstream
You can contribute your code in the form of `Pull Requests
<https://help.github.com/articles/using-pull-requests>`_. Generally, you should
create a new branch for any feature that is non-trivial.
You should also keep an eye on the |app| `development forum
<http://www.mobileread.com/forums/forumdisplay.php?f=240>`_. Before making
@ -297,10 +308,14 @@ code, with access to the |app| modules::
is great for testing a little snippet of code on the command line. It works in the same way as the -c switch to the python interpreter::
calibre-debug -e myscript.py
calibre-debug myscript.py
can be used to execute your own Python script. It works in the same way as passing the script to the Python interpreter, except
that the calibre environment is fully initialized, so you can use all the calibre code in your script.
that the calibre environment is fully initialized, so you can use all the calibre code in your script. To use command line arguments with your script, use the form::
calibre-debug myscript.py -- --option1 arg1
The ``--`` causes all subsequent arguments to be passed to your script.
Using |app| in your projects
@ -313,7 +328,7 @@ Binary install of |app|
If you have a binary install of |app|, you can use the Python interpreter bundled with |app|, like this::
calibre-debug -e /path/to/your/python/script.py
calibre-debug /path/to/your/python/script.py -- arguments to your script
Source install on Linux
^^^^^^^^^^^^^^^^^^^^^^^^^^

View File

@ -20,7 +20,7 @@ What formats does |app| support conversion to/from?
|app| supports the conversion of many input formats to many output formats.
It can convert every input format in the following list, to every output format.
*Input Formats:* CBZ, CBR, CBC, CHM, DJVU, EPUB, FB2, HTML, HTMLZ, LIT, LRF, MOBI, ODT, PDF, PRC, PDB, PML, RB, RTF, SNB, TCR, TXT, TXTZ
*Input Formats:* CBZ, CBR, CBC, CHM, DJVU, DOCX, EPUB, FB2, HTML, HTMLZ, LIT, LRF, MOBI, ODT, PDF, PRC, PDB, PML, RB, RTF, SNB, TCR, TXT, TXTZ
*Output Formats:* AZW3, EPUB, FB2, OEB, LIT, LRF, MOBI, HTMLZ, PDB, PML, RB, PDF, RTF, SNB, TCR, TXT, TXTZ
@ -29,13 +29,14 @@ It can convert every input format in the following list, to every output format.
PRC is a generic format, |app| supports PRC files with TextRead and MOBIBook headers.
PDB is also a generic format. |app| supports eReader, Plucker, PML and zTxt PDB files.
DJVU support is only for converting DJVU files that contain embedded text. These are typically generated by OCR software.
MOBI books can be of two types Mobi6 and KF8. |app| fully supports both. MOBI files often have .azw or .azw3 file extensions
MOBI books can be of two types Mobi6 and KF8. |app| fully supports both. MOBI files often have .azw or .azw3 file extensions.
DOCX files from Microsoft Word 2007 and newer are supported.
.. _best-source-formats:
What are the best source formats to convert?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In order of decreasing preference: LIT, MOBI, AZW, EPUB, AZW3, FB2, HTML, PRC, RTF, PDB, TXT, PDF
In order of decreasing preference: LIT, MOBI, AZW, EPUB, AZW3, FB2, DOCX, HTML, PRC, ODT, RTF, PDB, TXT, PDF
I converted a PDF file, but the result has various problems?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -6,30 +6,37 @@ import datetime
class FSP(BasicNewsRecipe):
title = u'Folha de S\xE3o Paulo'
__author__ = 'fluzao'
__author__ = 'Joao Eduardo Bertacchi'
description = u'Printed edition contents. UOL subscription required (Folha subscription currently not supported).' + \
u' [Conte\xfado completo da edi\xe7\xe3o impressa. Somente para assinantes UOL.]'
# found this to be the easiest place to find the index page (13-Nov-2011).
#found this to be the easiest place to find the index page (13-Nov-2011).
# searching for the "Indice Geral" link
HOMEPAGE = 'http://www1.folha.uol.com.br/fsp/'
today=datetime.date.today()
FIRSTPAGE= 'cp' + str(today.day).zfill(2) + str(today.month).zfill(2) + str(today.year) + '.shtml'
masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'
language = 'pt_BR'
no_stylesheets = True
max_articles_per_feed = 40
max_articles_per_feed = 50
remove_javascript = True
needs_subscription = True
remove_tags_before = dict(name='p')
remove_tags = [dict(name='td', attrs={'align':'center'})]
# remove_tags_before = dict(name='p')
# remove_tags_before = dict(name='div', id='articleNew')
# remove_tags_after = dict(name='div', id='articleNew')
keep_only_tags = [dict(name='div', id='articleNew'), dict(name='table', attrs={'class':'articleGraphic'})]
publication_type = 'newspaper'
simultaneous_downloads = 5
# remove_tags = [dict(name='td', attrs={'align':'center'})]
remove_attributes = ['height','width']
# fixes the problem with the section names
section_dict = {'cotidian' : 'cotidiano', 'ilustrad': 'ilustrada',
'quadrin': 'quadrinhos' , 'opiniao' : u'opini\xE3o',
'ciencia' : u'ci\xeancia' , 'saude' : u'sa\xfade',
'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio',
'imoveis' : u'im\xf3veis', 'negocios' : u'neg\xf3cios',
section_dict = {'cotidian' : 'cotidiano', 'ilustrad': 'ilustrada', \
'quadrin': 'quadrinhos' , 'opiniao' : u'opini\xE3o', \
'ciencia' : u'ci\xeancia' , 'saude' : u'sa\xfade', \
'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio', \
'imoveis' : u'im\xf3veis', 'negocios' : u'neg\xf3cios', \
'veiculos' : u've\xedculos', 'corrida' : 'folha corrida'}
# this solves the problem with truncated content in Kindle
@ -39,6 +46,40 @@ class FSP(BasicNewsRecipe):
# Indice e Comunicar Erros
preprocess_regexps = [(re.compile(r'<!--/NOTICIA-->.*Comunicar Erros</a>',
re.DOTALL|re.IGNORECASE), lambda match: r'')]
extra_css = """
#articleNew { font: 18px Times New Roman,verdana,arial; }
img { background: none !important; float: none; margin: 0px; }
.newstexts { list-style-type: none; height: 20px; margin: 15px 0 10px 0; }
.newstexts.last { border-top: 1px solid #ccc; margin: 5px 0 15px 0; padding-top: 15px; }
.newstexts li { display: inline; padding: 0 5px; }
.newstexts li.prev { float: left; }
.newstexts li.next { float: right; }
.newstexts li span { width: 12px; height: 15px; display: inline-block; }
.newstexts li.prev span { background-position: -818px -46px; }
.newstexts li.next span { background-position: -832px -46px; }
.newstexts li a { font: bold 12px arial, verdana, sans-serif; text-transform: uppercase; color: #999; text-decoration: none !important; }
.newstexts li a:hover { text-decoration: underline !important }
.headerart { font-weight: bold; }
.title { font: bold 39px Times New Roman,verdana,arial; margin-bottom: 15px; margin-top: 10px; }
.creditart, .origin { font: bold 12px arial, verdana, sans-serif; color: #999; margin: 0px; display: block; }
.headerart p, .fine_line p { margin: 0 !important; }
.fine_line { font: bold 18px Times New Roman,verdana,arial; }
.fine_line p { margin-bottom: 18px !important; }
.fine_line p:first-child { font-weight: normal; font-style: italic; font-size: 20px !important; }
.eye { display: block; width: 317px; border-top: 2px solid #666; padding: 7px 0 7px; border-bottom: 2px solid #666; font-style: italic; font-weight: bold; }
.kicker { font-weight: bold; text-transform: uppercase; font-size: 18px; font-family: Times New Roman,verdana,arial !important; }
.blue { color: #000080; }
.red { color: #F00; }
.blue { color: #000080; }
.green { color: #006400; }
.orange { color: #FFA042; }
.violet { color: #8A2BE2; }
.text_footer { font-size: 15px; }
.title_end { font-size: 23px; font-weight: bold; }
.divisor { text-indent: -9999px; border-bottom: 1px solid #ccc; height: 1px; margin: 0; }
.star { background: none !important; height: 15px; }
.articleGraphic { margin-bottom: 20px; }
"""
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
@ -48,23 +89,33 @@ class FSP(BasicNewsRecipe):
br['user'] = self.username
br['pass'] = self.password
br.submit().read()
# if 'Please try again' in raw:
# raise Exception('Your username and password are incorrect')
## if 'Please try again' in raw:
## raise Exception('Your username and password are incorrect')
return br
# def postprocess_html(self, soup, first_fetch):
# #Clean-up normal articles
# tags = soup.findAll('div', id='articleNew')
# if tags and tags[0]:
# return tags[0]
# #Clean-up first page
# tags = soup.findAll('div', attrs={'class':'double_column facsimile'})
# if tags and tags[0]:
# return tags[0]
# return soup
def parse_index(self):
# Searching for the index page on the HOMEPAGE
# hpsoup = self.index_to_soup(self.HOMEPAGE)
# indexref = hpsoup.find('a', href=re.compile('^indices.*'))
# self.log('--> tag containing the today s index: ', indexref)
# INDEX = indexref['href']
# INDEX = 'http://www1.folha.uol.com.br/'+INDEX
today=datetime.date.today()
INDEX = 'http://www1.folha.uol.com.br/' + 'fsp/indices/index-' + str(today).replace('-','') + '.shtml'
#Searching for the index page on the HOMEPAGE
hpsoup = self.index_to_soup(self.HOMEPAGE)
#indexref = hpsoup.find('a', href=re.compile('^indices.*'))
#self.log('--> tag containing the today s index: ', indexref)
#INDEX = indexref['href']
#INDEX = 'http://www1.folha.uol.com.br/'+INDEX
INDEX = 'http://www1.folha.uol.com.br/' + 'fsp/indices/index-' + str(self.today).replace('-','') + '.shtml'
self.log('--> INDEX after extracting href and adding prefix: ', INDEX)
# ... and taking the opportunity to get the cover image link
# coverurl = hpsoup.find('a', href=re.compile('^cp.*'))['href']
coverurl = 'cp' + str(today.day).zfill(2) + str(today.month).zfill(2) + str(today.year) + '.shtml'
#coverurl = hpsoup.find('a', href=re.compile('^cp.*'))['href']
coverurl = self.FIRSTPAGE
if coverurl:
self.log('--> tag containing the today s cover: ', coverurl)
coverurl = coverurl.replace('shtml', 'jpg')
@ -72,35 +123,37 @@ class FSP(BasicNewsRecipe):
self.log('--> coverurl after extracting href and adding prefix: ', coverurl)
self.cover_url = coverurl
# soup = self.index_to_soup(self.INDEX)
#soup = self.index_to_soup(self.INDEX)
soup = self.index_to_soup(INDEX)
feeds = []
articles = []
section_title = "Preambulo"
section_title = u'Primeira p\xe1gina'
for post in soup.findAll('a'):
# if name=True => new section
strpost = str(post)
# if strpost.startswith('<a name'):
if re.match('<a href="/fsp/.*/index-' + str(today).replace('-','') + '.shtml"><span class="', strpost):
#if strpost.startswith('<a name'):
if re.match('<a href="/fsp/.*/index-' + str(self.today).replace('-','') + '.shtml"><span class="', strpost):
if articles:
feeds.append((section_title, articles))
self.log()
self.log('--> new section found, creating old section feed: ', section_title)
# section_title = post['name']
#section_title = post['name']
section_title = self.tag_to_string(post)
if section_title in self.section_dict:
section_title = self.section_dict[section_title]
articles = []
self.log('--> new section title: ', section_title)
elif strpost.startswith('<a href="/fsp/cp'):
break
elif strpost.startswith('<a href'):
url = post['href']
# this bit is kept if they ever go back to the old format (pre Nov-2011)
#this bit is kept if they ever go back to the old format (pre Nov-2011)
if url.startswith('/fsp'):
url = 'http://www1.folha.uol.com.br'+url
#
if url.startswith('http://www1.folha.uol.com.br/fsp'):
# url = 'http://www1.folha.uol.com.br'+url
#url = 'http://www1.folha.uol.com.br'+url
title = self.tag_to_string(post)
self.log()
self.log('--> post: ', post)
@ -111,15 +164,15 @@ class FSP(BasicNewsRecipe):
feeds.append((section_title, articles))
# keeping the front page url
# minha_capa = feeds[0][1][1]['url']
#minha_capa = feeds[0][1][1]['url']
# removing the first section ('Preambulo')
del feeds[0]
#del feeds[0][1][0]
# inserting the cover page as the first article (nicer for kindle users)
# feeds.insert(0,(u'primeira p\xe1gina', [{'title':u'Primeira p\xe1gina' , 'url':minha_capa}]))
feeds.insert(0,(u'Capa', [{'title':u'Capa' , 'url':self.get_cover_url().replace('jpg', 'shtml')}]))
#feeds.insert(0,(u'primeira p\xe1gina', [{'title':u'Primeira p\xe1gina' , 'url':minha_capa}]))
#feeds[0][1].insert(0,{'title':u'fac-s\xedmile da capa' , 'url':self.HOMEPAGE+self.FIRSTPAGE})
return feeds

View File

@ -1,3 +1,4 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
@ -5,7 +6,6 @@ frontlineonnet.com
'''
import re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class Frontlineonnet(BasicNewsRecipe):
@ -18,7 +18,7 @@ class Frontlineonnet(BasicNewsRecipe):
delay = 1
INDEX = 'http://frontlineonnet.com/'
use_embedded_content = False
encoding = 'cp1252'
encoding = 'utf-8'
language = 'en_IN'
publication_type = 'magazine'
masthead_url = 'http://frontlineonnet.com/images/newfline.jpg'
@ -45,37 +45,36 @@ class Frontlineonnet(BasicNewsRecipe):
]
keep_only_tags= [
dict(name='font', attrs={'class':'storyhead'})
,dict(attrs={'class':'byline'})
dict(name='div', attrs={'id':'content'})
#,dict(attrs={'class':'byline'})
]
remove_attributes=['size','noshade','border']
#remove_attributes=['size','noshade','border']
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
return soup
#def preprocess_html(self, soup):
#for item in soup.findAll(style=True):
#del item['style']
#for item in soup.findAll('img'):
#if not item.has_key('alt'):
#item['alt'] = 'image'
#return soup
def parse_index(self):
articles = []
soup = self.index_to_soup(self.INDEX)
for feed_link in soup.findAll('a',href=True):
if feed_link['href'].startswith('stories/'):
url = self.INDEX + feed_link['href']
title = self.tag_to_string(feed_link)
date = strftime(self.timefmt)
articles.append({
'title' :title
,'date' :date
,'url' :url
,'description':''
})
for feed_link in soup.findAll('div', id='headseccol'):
a = feed_link.find('a', href=True)
title = self.tag_to_string(a)
url = a['href']
articles.append({
'title' :title
,'date' :''
,'url' :url
,'description':''
})
return [('Frontline', articles)]
def print_version(self, url):
return "http://www.hinduonnet.com/thehindu/thscrip/print.pl?prd=fline&file=" + url.rpartition('/')[2]
#def print_version(self, url):
#return "http://www.hinduonnet.com/thehindu/thscrip/print.pl?prd=fline&file=" + url.rpartition('/')[2]
def image_url_processor(self, baseurl, url):
return url.replace('../images/', self.INDEX + 'images/').strip()
#def image_url_processor(self, baseurl, url):
#return url.replace('../images/', self.INDEX + 'images/').strip()

View File

@ -1,35 +0,0 @@
import urllib, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre import __appname__
class GoogleReader(BasicNewsRecipe):
    """Recipe that builds its feed list from a Google Reader account.

    Each tag id returned by the Google Reader tag list API becomes one feed,
    fetched through the Atom endpoint under ``base_url``. This code uses the
    Python 2 ``urllib`` API (``urlencode``/``quote``).
    """

    title = 'Google Reader'
    description = 'This recipe fetches from your Google Reader account unread Starred items and unread Feeds you have placed in a folder via the manage subscriptions feature.'
    needs_subscription = True
    __author__ = 'davec, rollercoaster, Starson17'
    base_url = 'http://www.google.com/reader/atom/'
    oldest_article = 365
    max_articles_per_feed = 250
    # Query string appended to every feed URL; n is the per-feed item limit.
    get_options = '?n=%d&xt=user/-/state/com.google/read' % max_articles_per_feed
    use_embedded_content = True

    def get_browser(self):
        """Return a browser, authenticated against Google when credentials are set.

        Without both a username and a password, the browser from
        ``BasicNewsRecipe.get_browser`` is returned unchanged. Otherwise the
        credentials are posted to the ClientLogin endpoint and a new mechanize
        opener carrying the resulting ``Authorization`` header is returned.

        Raises:
            ValueError: if the login response contains no ``Auth=`` token.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            request = urllib.urlencode([('Email', self.username), ('Passwd', self.password),
                                        ('service', 'reader'), ('accountType', 'HOSTED_OR_GOOGLE'), ('source', __appname__)])
            response = br.open('https://www.google.com/accounts/ClientLogin', request)
            token = re.search(r'Auth=(\S*)', response.read())
            if token is None:
                # Previously this surfaced as an AttributeError on None.group.
                raise ValueError('Google login failed: no Auth token in the ClientLogin response')
            auth = token.group(1)
            cookies = mechanize.CookieJar()
            # The plain browser is replaced by an opener that sends the token.
            br = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
            br.addheaders = [('Authorization', 'GoogleLogin auth='+auth)]
        return br

    def get_feeds(self):
        """Return ``(title, url)`` pairs, one per tag id in the tag list.

        The title is the text after the last ``/`` of the tag id (the whole id
        when it contains no ``/``); the URL is ``base_url`` plus the quoted tag
        id plus ``get_options``.
        """
        feeds = []
        soup = self.index_to_soup('http://www.google.com/reader/api/0/tag/list')
        for tag in soup.findAll(True, attrs={'name':['id']}):
            url = tag.contents[0]
            feeds.append((url.rpartition('/')[2],
                          self.base_url + urllib.quote(url.encode('utf-8')) + self.get_options))
        return feeds

View File

@ -1,35 +0,0 @@
import urllib, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre import __appname__
class GoogleReaderUber(BasicNewsRecipe):
    """Recipe that builds its feed list from a Google Reader account.

    Each tag id returned by the Google Reader tag list API becomes one feed,
    with ``broadcast`` in the id replaced by ``reading-list`` before the feed
    URL is built. This code uses the Python 2 ``urllib`` API
    (``urlencode``/``quote``).
    """

    title = 'Google Reader uber'
    description = 'Fetches all feeds from your Google Reader account including the uncategorized items.'
    needs_subscription = True
    __author__ = 'davec, rollercoaster, Starson17'
    base_url = 'http://www.google.com/reader/atom/'
    oldest_article = 365
    max_articles_per_feed = 250
    # Query string appended to every feed URL; n is the per-feed item limit.
    get_options = '?n=%d&xt=user/-/state/com.google/read' % max_articles_per_feed
    use_embedded_content = True

    def get_browser(self):
        """Return a browser, authenticated against Google when credentials are set.

        Without both a username and a password, the browser from
        ``BasicNewsRecipe.get_browser`` is returned unchanged. Otherwise the
        credentials are posted to the ClientLogin endpoint and a new mechanize
        opener carrying the resulting ``Authorization`` header is returned.

        Raises:
            ValueError: if the login response contains no ``Auth=`` token.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            request = urllib.urlencode([('Email', self.username), ('Passwd', self.password),
                                        ('service', 'reader'), ('accountType', 'HOSTED_OR_GOOGLE'), ('source', __appname__)])
            response = br.open('https://www.google.com/accounts/ClientLogin', request)
            token = re.search(r'Auth=(\S*)', response.read())
            if token is None:
                # Previously this surfaced as an AttributeError on None.group.
                raise ValueError('Google login failed: no Auth token in the ClientLogin response')
            auth = token.group(1)
            cookies = mechanize.CookieJar()
            # The plain browser is replaced by an opener that sends the token.
            br = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
            br.addheaders = [('Authorization', 'GoogleLogin auth='+auth)]
        return br

    def get_feeds(self):
        """Return ``(title, url)`` pairs, one per tag id in the tag list.

        ``broadcast`` in a tag id is replaced by ``reading-list`` first. The
        title is the text after the last ``/`` of the resulting id (the whole
        id when it contains no ``/``); the URL is ``base_url`` plus the quoted
        id plus ``get_options``.
        """
        feeds = []
        soup = self.index_to_soup('http://www.google.com/reader/api/0/tag/list')
        for tag in soup.findAll(True, attrs={'name':['id']}):
            url = tag.contents[0].replace('broadcast','reading-list')
            feeds.append((url.rpartition('/')[2],
                          self.base_url + urllib.quote(url.encode('utf-8')) + self.get_options))
        return feeds

View File

@ -1,5 +1,4 @@
__license__ = 'GPL v3'
__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2013, Darko Miletic <darko.miletic at gmail.com>'
'''
lanacion.com.ar
'''
@ -45,36 +44,32 @@ class Lanacion(BasicNewsRecipe):
remove_tags_after = dict(attrs={'id':'relacionadas'})
feeds = [
(u'Politica' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=30' )
,(u'Deportes' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=131' )
,(u'Economia' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=272' )
,(u'Informacion General' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=21' )
,(u'Cultura' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=1' )
,(u'Opinion' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=28' )
,(u'Espectaculos' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=120' )
,(u'Exterior' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=7' )
,(u'Ciencia&Salud' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=498' )
,(u'Revista' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=494' )
,(u'Enfoques' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=421' )
,(u'Comercio Exterior' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=347' )
,(u'Tecnologia' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=432' )
,(u'Arquitectura' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=366' )
,(u'Turismo' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=504' )
,(u'Al volante' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=371' )
,(u'El Campo' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=337' )
,(u'Moda y Belleza' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=1312')
,(u'Inmuebles Comerciales', u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=1363')
,(u'Countries' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=1348')
,(u'adnCultura' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=6734')
,(u'The WSJ Americas' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=6373')
,(u'Comunidad' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=1344')
,(u'Management' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=7380')
,(u'Bicentenario' , u'http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=7276')
(u'Politica' , u'http://lanacion.com.ar.feedsportal.com/politica' )
,(u'Deportes' , u'http://lanacion.com.ar.feedsportal.com/deportes' )
,(u'Economia' , u'http://lanacion.com.ar.feedsportal.com/economia' )
,(u'Sociedad' , u'http://lanacion.com.ar.feedsportal.com/sociedad' )
,(u'Seguridad' , u'http://lanacion.com.ar.feedsportal.com/seguridad' )
,(u'Buenos Aires' , u'http://lanacion.com.ar.feedsportal.com/buenosaires' )
,(u'Opinion' , u'http://lanacion.com.ar.feedsportal.com/opinion' )
,(u'Espectaculos' , u'http://lanacion.com.ar.feedsportal.com/espectaculos' )
,(u'El Mundo' , u'http://lanacion.com.ar.feedsportal.com/mundo' )
,(u'Revista' , u'http://lanacion.com.ar.feedsportal.com/revistalanacion' )
,(u'Enfoques' , u'http://lanacion.com.ar.feedsportal.com/enfoques' )
,(u'Comercio Exterior' , u'http://lanacion.com.ar.feedsportal.com/comercioexterior' )
,(u'Tecnologia' , u'http://lanacion.com.ar.feedsportal.com/tecnologia' )
,(u'Turismo' , u'http://lanacion.com.ar.feedsportal.com/turismo' )
,(u'Al volante' , u'http://lanacion.com.ar.feedsportal.com/alvolante' )
,(u'El Campo' , u'http://lanacion.com.ar.feedsportal.com/elcampo' )
,(u'Moda y Belleza' , u'http://lanacion.com.ar.feedsportal.com/modaybelleza' )
,(u'Inmuebles Comerciales', u'http://lanacion.com.ar.feedsportal.com/inmueblescomerciales' )
,(u'Countries' , u'http://lanacion.com.ar.feedsportal.com/countries' )
,(u'adnCultura' , u'http://lanacion.com.ar.feedsportal.com/adncultura' )
,(u'The WSJ Americas' , u'http://lanacion.com.ar.feedsportal.com/wallstreetjournalamericas')
]
def get_article_url(self, article):
link = BasicNewsRecipe.get_article_url(self,article)
link = article.get('guid', None)
if link.startswith('http://blogs.lanacion') and not link.endswith('/'):
return self.browser.open_novisit(link).geturl()
if link.rfind('galeria=') > 0:

View File

@ -1,5 +1,6 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1274742400(BasicNewsRecipe):
title = u'Las Vegas Review Journal'
@ -9,24 +10,24 @@ class AdvancedUserRecipe1274742400(BasicNewsRecipe):
oldest_article = 7
max_articles_per_feed = 100
#keep_only_tags = [dict(id='content-main')]
#remove_tags = [dict(id=['right-col-content', 'trending-topics']),
# keep_only_tags = [dict(id='content-main')]
# remove_tags = [dict(id=['right-col-content', 'trending-topics']),
#{'class':['ppy-outer']}
#]
no_stylesheets = True
use_embedded_content = False
auto_cleanup = True
feeds = [
(u'News', u'http://www.lvrj.com/news.rss'),
(u'Business', u'http://www.lvrj.com/business.rss'),
(u'Living', u'http://www.lvrj.com/living.rss'),
(u'Opinion', u'http://www.lvrj.com/opinion.rss'),
(u'Neon', u'http://www.lvrj.com/neon.rss'),
#(u'Image', u'http://www.lvrj.com/image.rss'),
#(u'Home & Garden', u'http://www.lvrj.com/home_and_garden.rss'),
#(u'Furniture & Design', u'http://www.lvrj.com/furniture_and_design.rss'),
#(u'Drive', u'http://www.lvrj.com/drive.rss'),
#(u'Real Estate', u'http://www.lvrj.com/real_estate.rss'),
(u'Sports', u'http://www.lvrj.com/sports.rss')]
(u'Top Stories', u'http://www.reviewjournal.com/rss.xml'),
(u'News', u'http://www.reviewjournal.com/news/feed'),
(u'Business', u'http://www.reviewjournal.com/business/feed'),
(u'Living', u'http://www.reviewjournal.com/living/feed'),
(u'Opinion', u'http://www.reviewjournal.com/opinion/feed'),
(u'Neon', u'http://www.reviewjournal.com/neon/feed'),
#(u'Image', u'http://www.lvrj.com/image.rss'),
#(u'Home & Garden', u'http://www.lvrj.com/home_and_garden.rss'),
#(u'Furniture & Design', u'http://www.lvrj.com/furniture_and_design.rss'),
#(u'Drive', u'http://www.lvrj.com/drive.rss'),
#(u'Real Estate', u'http://www.lvrj.com/real_estate.rss'),
(u'Sports', u'http://www.reviewjournal.com/sports/feed')]

View File

@ -39,6 +39,8 @@ from BeautifulSoup import BeautifulSoup
Version 1.9.4 19-04-2013
Added regex filter for mailto
Updated for new layout of metro-site
Version 1.9.5 28-05-2013
Added some extra id's and classes to remove
'''
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
@ -46,7 +48,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
oldest_article = 1.2
max_articles_per_feed = 25
__author__ = u'DrMerry'
description = u'Metro Nederland v1.9.4 2013-04-19'
description = u'Metro Nederland v1.9.5 2013-05-28, Download nieuws van de Nederlandse editie van de krant Metro'
language = u'nl'
simultaneous_downloads = 5
masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
@ -70,7 +72,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
#(re.compile(r'<(a |/a)[^>]*>', re.DOTALL|re.IGNORECASE),lambda match:'')
#(re.compile('(</?)h2', re.DOTALL|re.IGNORECASE),lambda match:'\1em')
]
remove_tags_before= dict(id='subwrapper')
remove_tags_after = dict(name='div', attrs={'class':['body-area','article-main-area']})
#name='div', attrs={'class':['subwrapper']})]
@ -80,13 +82,13 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
remove_tags = [
dict(name=['iframe','script','noscript','style']),
dict(name='div', attrs={'class':['aside clearfix','aside clearfix middle-col-line','comments','share-tools','article-right-column','column-4-5','column-1-5','ad-msg','col-179 ','col-373 ','clear','ad','navigation',re.compile('share-tools(-top)?'),'tools','metroCommentFormWrap','article-tools-below-title','related-links','padding-top-15',re.compile('^promo.*?$'),'teaser-component',re.compile('fb(-comments|_iframe_widget)'),'promos','header-links','promo-2']}),
dict(id=['article-2','googleads','column-1-5-bottom','column-4-5',re.compile('^ad(\d+|adcomp.*?)?$'),'adadcomp-4','margin-5','sidebar',re.compile('^article-\d'),'comments','gallery-1','sharez_container','ts-container','topshares','ts-title']),
dict(name='div', attrs={'class':['fact-related-box','aside clearfix','aside clearfix middle-col-line','comments','share-tools','article-right-column','column-4-5','column-1-5','ad-msg','col-179 ','col-373 ','clear','ad','navigation',re.compile('share-tools(-top)?'),'tools','metroCommentFormWrap','article-tools-below-title','related-links','padding-top-15',re.compile('^promo.*?$'),'teaser-component',re.compile('fb(-comments|_iframe_widget)'),'promos','header-links','promo-2']}),
dict(id=['super-carousel','article-2','googleads','column-1-5-bottom','column-4-5',re.compile('^ad(\d+|adcomp.*?)?$'),'adadcomp-4','margin-5','sidebar',re.compile('^article-\d'),'comments','gallery-1','sharez_container','ts-container','topshares','ts-title']),
dict(name='a', attrs={'name':'comments'}),
#dict(name='div', attrs={'data-href'}),
dict(name='img', attrs={'class':'top-line','title':'volledig scherm'}),
dict(attrs={'style':re.compile('^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$'),'title':'volledig scherm'})]
'''removed by before/after:
id:
column-1-5-top,'hidden_div','footer',
@ -182,7 +184,7 @@ class MerryProcess(BeautifulSoup):
except:
pass
return soup
def moveTitleAndAuthor(self, soup):
moveitem = soup.h1
pubdate = soup.find(id="date")
@ -218,4 +220,4 @@ class MerryProcess(BeautifulSoup):
self.removeArrayOfTags(emptytags)
#recursive in case removing empty tag creates new empty tag
self.removeEmptyTags(soup, run=run)
return soup
return soup

View File

@ -0,0 +1,49 @@
# vim:fileencoding=utf-8
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1344926684(BasicNewsRecipe):
    # Recipe for noz.de: articles come from the per-section RSS feeds listed
    # in ``feeds`` and are trimmed with the explicit tag lists below.
    title = u'Neue Osnabrücker Zeitung'
    __author__ = 'Krittika Goyal'
    language = 'de'

    # Feed limits.
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False

    # Clean-up switches.
    # auto_cleanup = True   (left disabled; keep_only_tags/remove_tags are used)
    no_stylesheets = True
    remove_javascript = True

    # Containers that are retained.
    keep_only_tags = [
        {'name': 'div', 'attrs': {'class': 'article'}},
        {'name': 'span', 'attrs': {'id': 'articletext'}},
    ]

    # Elements that are stripped from the retained containers.
    # NOTE(review): some class values carry leading/trailing spaces; they are
    # reproduced exactly -- presumably they mirror the site's markup, verify.
    remove_tags = [
        {'name': 'div', 'attrs': {'id': 'retresco-title'}},
        {'name': 'div', 'attrs': {'class': 'retresco-item s1 relative'}},
        {'name': 'a', 'attrs': {'class': 'medium2 largeSpaceTop icon'}},
        {'name': 'div', 'attrs': {'class': 'articleFunctions inlineTeaserRight'}},
        {'name': 'div', 'attrs': {'class': 'imageContainer '}},
        {'name': 'div', 'attrs': {'class': 'imageContainer centerContainer'}},
        {'name': 'div', 'attrs': {'class': 'grid singleCol articleTeaser'}},
        {'name': 'h3', 'attrs': {'class': 'teaserRow'}},
        {'name': 'div', 'attrs': {'class': 'related-comments'}},
        {'name': 'a', 'attrs': {'class': ' icon'}},
        {'name': 'a', 'attrs': {'class': 'right small'}},
        {'name': 'span', 'attrs': {'class': 'small block spaceBottom rectangleAd'}},
        {'name': 'div', 'attrs': {'class': 'furtherGalleries largeSpaceTop'}},
    ]

    # (section title, RSS url) pairs.
    feeds = [
        (u'Lokales', u'http://www.noz.de/rss/Lokales'),
        (u'Vermischtes', u'http://www.noz.de/rss/Vermischtes'),
        (u'Politik', u'http://www.noz.de/rss/Politik'),
        (u'Wirtschaft', u'http://www.noz.de/rss/Wirtschaft'),
        (u'Kultur', u'http://www.noz.de/rss/Kultur'),
        (u'Medien', u'http://www.noz.de/rss/Medien'),
        (u'Wissenschaft', u'http://www.noz.de/rss/wissenschaft'),
        (u'Sport', u'http://www.noz.de/rss/Sport'),
        (u'Computer', u'http://www.noz.de/rss/Computer'),
        (u'Musik', u'http://www.noz.de/rss/Musik'),
        (u'Szene', u'http://www.noz.de/rss/Szene'),
        (u'Niedersachsen', u'http://www.noz.de/rss/Niedersachsen'),
        (u'Kino', u'http://www.noz.de/rss/Kino'),
    ]

View File

@ -29,24 +29,22 @@ class NYTimesSports(BasicNewsRecipe):
category = 'Sports'
oldest_article = 3
max_articles_per_feed = 25
use_embedded_content = False
no_stylesheets = True
language = 'en'
#cover_url ='http://bit.ly/h8F4DO'
auto_cleanup = True
auto_cleanup_keep = '//div[@class="articleSpanImage"]'
feeds = [
(u'The Fifth Down', u'http://fifthdown.blogs.nytimes.com/feed/'),
(u'Off The Dribble', u'http://offthedribble.blogs.nytimes.com/feed/'),
(u'The Quad', u'http://thequad.blogs.nytimes.com/feed/'),
(u'Slap Shot', u'http://slapshot.blogs.nytimes.com/feed/'),
(u'Goal', u'http://goal.blogs.nytimes.com/feed/'),
(u'Bats', u'http://bats.blogs.nytimes.com/feed/'),
(u'Straight Sets', u'http://straightsets.blogs.nytimes.com/feed/'),
(u'Formula One', u'http://formulaone.blogs.nytimes.com/feed/'),
(u'On Par', u'http://onpar.blogs.nytimes.com/feed/'),
]
keep_only_tags = [dict(name='div', attrs={'id':'header'}),
dict(name='h1'),
dict(name='h2'),
dict(name='div', attrs={'class':'entry-content'})]
(u'The Fifth Down', u'http://fifthdown.blogs.nytimes.com/feed/'),
(u'Off The Dribble', u'http://offthedribble.blogs.nytimes.com/feed/'),
(u'The Quad', u'http://thequad.blogs.nytimes.com/feed/'),
(u'Slap Shot', u'http://slapshot.blogs.nytimes.com/feed/'),
(u'Goal', u'http://goal.blogs.nytimes.com/feed/'),
(u'Bats', u'http://bats.blogs.nytimes.com/feed/'),
(u'Straight Sets', u'http://straightsets.blogs.nytimes.com/feed/'),
(u'Formula One', u'http://formulaone.blogs.nytimes.com/feed/'),
(u'On Par', u'http://onpar.blogs.nytimes.com/feed/'),
]
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}

View File

@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
import re
__license__ = 'GPL v3'
class SeventhGuard(BasicNewsRecipe):
    # Recipe for 7thguard.net (Polish); content comes from the single RSS feed
    # in ``feeds`` and is cleaned with auto_cleanup plus the regexps below.
    title = '7th Guard'
    description= u""" 7thGuard.net jest jednym z najstarszych w polskiej części internetu serwisem poświęconym otwartości standardów,
wolności oprogramowania, szeroko pojętych wolności w internecie, walce z praktykami monopolistycznymi w świecie informatyki oraz
co jest coraz ważniejsze w dzisiejszym świecie rozwojowi społeczeństwa informacyjnego."""
    __author__ = 'koliberek'
    # The same logo image is used both as masthead and as cover.
    masthead_url='http://beta.7thguard.net/wp-content/uploads/2013/05/7thguard-bladerunner-logo1.png'
    cover_url = 'http://beta.7thguard.net/wp-content/uploads/2013/05/7thguard-bladerunner-logo1.png'
    language = 'pl'
    oldest_article = 7
    max_articles_per_feed = 100
    timefmt = ' [%A, %d %B %Y]'
    auto_cleanup = True
    remove_javascript=True
    no_stylesheets = True
    remove_empty_feeds = True
    conversion_options = {'smarten_punctuation' : True}
    # CSS fixes relative to the first version of this recipe:
    #  * the generic font family keyword is ``sans-serif`` (hyphenated);
    #    ``sans serif`` is read as an ordinary family name and never matches
    #    the generic fallback.
    #  * italics are set with ``font-style``; ``text-style`` is not a CSS
    #    property, so that rule was silently ignored.
    extra_css="""h2 {font-size:12pt; font-family:Arial,Helvetica,sans-serif;}
h1 {font-size:14pt; font-family:Arial,Helvetica,sans-serif;}
p {text-align:justify;}
.article, .feed, .calibre_feed_description, .article_description {font-family:Arial,Helvetica,sans-serif;}
.article_description {font-style:italic;}
"""
    remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height', 'font']
    # Strip line breaks before parsing.  The second pattern is limited to a
    # single tag with ``[^>]*``; the previous greedy ``.*`` could run on to the
    # last ``/>`` on the same line and delete the markup in between.
    preprocess_regexps = [
        (re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
        (re.compile(r'<br[ ]*clear[^>]*/>', re.IGNORECASE), lambda m: ''),
    ]
    feeds = [(u'Aktualno\u015bci', u'http://7thguard.net/feed/')]

View File

@ -1,77 +1,67 @@
#!/usr/bin/env python
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'
'''
time.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.jsnews import JavascriptRecipe
from lxml import html
class Time(BasicNewsRecipe):
def wait_for_load(browser):
    '''
    Block until the page in ``browser`` shows the site header element, waiting
    at most 180 seconds (the timeout is passed to ``wait_for_element``).
    '''
    # This element is present in the black login bar at the top
    header_selector = '#site-header p.constrain'
    browser.wait_for_element(header_selector, timeout=180)
# Keep the login method as standalone, so it can be easily tested
def do_login(browser, username, password):
    '''
    Log in to time.com using ``browser``.

    Visits the magazine page, fills in the ``#magazine-signup`` form with the
    given credentials, submits it and then waits for the page to load.

    :raises ValueError: if waiting for the page times out after submitting.
    '''
    from calibre.web.jsbrowser.browser import Timeout

    browser.visit('http://www.time.com/time/magazine')
    login_form = browser.select_form('#magazine-signup')
    # Fill the fields in the same order as before: username, then password
    for field, value in (('username', username), ('password', password)):
        login_form[field] = value
    browser.submit('#paid-wall-submit')

    try:
        wait_for_load(browser)
    except Timeout:
        raise ValueError('Failed to login to time.com, check your username and password and try again in a little while.')
class Time(JavascriptRecipe):
title = u'Time'
__author__ = 'Kovid Goyal, Rick Shang'
description = ('Weekly US magazine.')
encoding = 'utf-8'
__author__ = 'Kovid Goyal'
description = 'Weekly US magazine.'
language = 'en'
needs_subscription = True
requires_version = (0, 9, 35)
no_stylesheets = True
language = 'en'
remove_javascript = True
needs_subscription = True
keep_only_tags = ['article.post']
remove_tags = ['meta', '.entry-sharing', '.entry-footer', '.wp-paginate',
'.post-rail', '.entry-comments', '.entry-tools',
'#paid-wall-cm-ad']
keep_only_tags = [
{
'class':['primary-col', 'tout1']
},
]
remove_tags = [
{'class':['button', 'entry-sharing group', 'wp-paginate',
'moving-markup', 'entry-comments']},
recursions = 1
links_from_selectors = ['.wp-paginate a.page[href]']
]
extra_css = '.entry-date { padding-left: 2ex }'
preprocess_regexps = [(re.compile(
r'<meta .+/>'), lambda m:'')]
def do_login(self, browser, username, password):
do_login(browser, username, password)
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
# This site uses javascript in its login process
if self.username is not None and self.password is not None:
br.open('http://www.time.com/time/magazine')
br.select_form(predicate=lambda f: 'action' in f.attrs and f.attrs['action'] == 'https://auth.time.com/login.php')
br['username'] = self.username
br['password'] = self.password
# br['magcode'] = ['TD']
br.find_control('turl').readonly = False
br['turl'] = 'http://www.time.com/time/magazine'
br.find_control('rurl').readonly = False
br['rurl'] = 'http://www.time.com/time/magazine'
br['remember'] = False
raw = br.submit().read()
if False and '>Log Out<' not in raw:
# This check is disabled as it does not work (there is probably
# some cookie missing) however, the login is "sufficient" for
# the actual article downloads to work.
raise ValueError('Failed to login to time.com, check'
' your username and password')
return br
def parse_index(self):
raw = self.index_to_soup('http://www.time.com/time/magazine', raw=True)
def get_publication_data(self, browser):
selector = 'section.sec-mag-showcase ul.ul-mag-showcase img[src]'
cover = browser.css_select(selector)
# URL for large cover
cover_url = unicode(cover.evaluateJavaScript('this.src').toString()).replace('_400.', '_600.')
raw = browser.html
ans = {'cover': browser.get_resource(cover_url)}
# We are already at the magazine page thanks to the do_login() method
root = html.fromstring(raw)
img = root.xpath('//a[.="View Large Cover" and @href]')
if img:
cover_url = 'http://www.time.com' + img[0].get('href')
try:
nsoup = self.index_to_soup(cover_url)
img = nsoup.find('img', src=re.compile('archive/covers'))
if img is not None:
self.cover_url = img['src']
except:
self.log.exception('Failed to fetch cover')
dates = ''.join(root.xpath('//time[@class="updated"]/text()'))
if dates:
@ -90,27 +80,22 @@ class Time(BasicNewsRecipe):
if articles:
feeds.append((section, articles))
return feeds
ans['index'] = feeds
return ans
def find_articles(self, sec):
for article in sec.xpath('./article'):
h2 = article.xpath('./*[@class="entry-title"]')
if not h2: continue
if not h2:
continue
a = h2[0].xpath('./a[@href]')
if not a: continue
if not a:
continue
title = html.tostring(a[0], encoding=unicode,
method='text').strip()
if not title: continue
if not title:
continue
url = a[0].get('href')
if url.startswith('/'):
url = 'http://www.time.com'+url
if '/article/0,' in url:
soup = self.index_to_soup(url)
a = soup.find('a', href=lambda x:x and '/printout/' in x)
url = a['href'].replace('/printout', '/subscriber/printout')
else:
url += 'print/' if url.endswith('/') else '/print/'
if url.startswith('/'):
url = 'http://www.time.com'+url
desc = ''
@ -126,10 +111,35 @@ class Time(BasicNewsRecipe):
'description' : desc
}
def preprocess_html(self, soup):
for fig in soup.findAll('figure'):
img = fig.find('img')
if img is not None:
fig.replaceWith(img)
return soup
def load_complete(self, browser, url, recursion_level):
# This is needed as without it, subscriber content is blank. time.com
# appears to be using some crazy iframe+js callback for loading content
wait_for_load(browser)
return True
def postprocess_html(self, article, root, url, recursion_level):
# Remove the header and page n of m messages from pages after the first
# page
if recursion_level > 0:
for h in root.xpath('//header[@class="entry-header"]|//span[@class="page"]'):
h.getparent().remove(h)
# Unfloat the article images and also remove them from pages after the
# first page as they are repeated on every page.
for fig in root.xpath('//figure'):
parent = fig.getparent()
if recursion_level > 0:
parent.remove(fig)
else:
idx = parent.index(fig)
for img in reversed(fig.xpath('descendant::img')):
parent.insert(idx, img)
parent.remove(fig)
return root
if __name__ == '__main__':
    # Manual login test: the last two command line arguments are used as
    # username and password, then the browser window is shown.
    import sys
    from calibre import jsbrowser
    br = jsbrowser(default_timeout=120)
    do_login(br, username=sys.argv[-2], password=sys.argv[-1])
    br.show_browser()

View File

@ -1,74 +1,58 @@
import re, urllib
# vim:fileencoding=utf-8
from calibre.web.feeds.news import BasicNewsRecipe
from lxml import html
allowed_sections = {'Top Headlines', 'Opinion', 'Science', 'Education', 'US', 'Pakistan', 'India Business', 'Tech News', 'Cricket', 'Bollywood'}
class TimesOfIndia(BasicNewsRecipe):
title = u'Times of India'
language = 'en_IN'
title = u'Times of India Headlines'
language = 'en'
description = 'Headline news from the Indian daily Times of India'
__author__ = 'Kovid Goyal'
oldest_article = 1 #days
max_articles_per_feed = 25
no_stylesheets = True
remove_attributes = ['style']
keep_only_tags = [
{'class':re.compile(r'maintable12|prttabl')},
{'id':['mod-article-header',
'mod-a-body-after-first-para', 'mod-a-body-first-para']},
]
no_javascript = True
keep_only_tags = [dict(name='h1'), dict(id=['storydiv', 'contentarea'])]
remove_tags = [
{'class':re.compile('tabsintbgshow|prvnxtbg')},
{'id':['fbrecommend', 'relmaindiv', 'shretxt', 'fbrecos', 'twtdiv',
'gpls', 'auim']},
{'class':['twitter-share-button', 'cmtmn']},
]
dict(name='div', attrs={'class':['video_list', 'rightpart', 'clearfix mTop15', 'footer_slider', 'read_more', 'flR', 'hide_new']}),
dict(name='div', attrs={'id':[
'most_pop', 'relartstory', 'slidebox', 'tmpFbokk', 'twittersource',
'reportAbuseDiv', 'result', 'yahoobuzzsyn', 'fb-root']}),
dict(style='float:right;margin-left:5px;'),
]
feeds = [
('Top Stories',
'http://timesofindia.indiatimes.com/rssfeedstopstories.cms'),
('India',
'http://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms'),
('World',
'http://timesofindia.indiatimes.com/rssfeeds/296589292.cms'),
('Mumbai',
'http://timesofindia.indiatimes.com/rssfeeds/-2128838597.cms'),
('Entertainment',
'http://timesofindia.indiatimes.com/rssfeeds/1081479906.cms'),
('Cricket',
'http://timesofindia.indiatimes.com/rssfeeds/4719161.cms'),
('Sunday TOI',
'http://timesofindia.indiatimes.com/rssfeeds/1945062111.cms'),
('Life and Style',
'http://timesofindia.indiatimes.com/rssfeeds/2886704.cms'),
('Business',
'http://timesofindia.indiatimes.com/rssfeeds/1898055.cms'),
('Mad Mad World',
'http://timesofindia.indiatimes.com/rssfeeds/2178430.cms'),
('Most Read',
'http://timesofindia.indiatimes.com/rssfeedmostread.cms')
]
def parse_index(self):
index = 'http://timesofindia.indiatimes.com/home/headlines'
raw = self.index_to_soup(index, raw=True)
root = html.fromstring(raw)
feeds = []
current_section = None
current_articles = []
toc = root.xpath('//div[@align="center"]/descendant::table[@class="cnt"]')[0]
for x in toc.xpath('descendant::*[name()="h3" or (name()="ul" and @class="content")]'):
if x.tag == 'h3':
if current_articles and current_section in allowed_sections:
feeds.append((current_section, current_articles))
current_section = html.tostring(x, method='text', encoding=unicode).strip()
current_articles = []
self.log(current_section)
else:
for a in x.xpath('descendant::li/descendant::a[@href]'):
title = html.tostring(a, method='text', encoding=unicode).strip()
url = a.get('href')
if url.startswith('/'):
url = 'http://timesofindia.indiatimes.com' + url
self.log(' ', title)
current_articles.append({'title':title, 'url':url})
self.log('')
if current_articles and current_section in allowed_sections:
feeds.append((current_section, current_articles))
return feeds
def get_article_url(self, article):
try:
s = article.summary
return urllib.unquote(
re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
except:
pass
link = article.get('link', None)
if link and link.split('/')[-1]=="story01.htm":
link=link.split('/')[-2]
encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
'0D': '?', '0E': '-', '0N': '.com', '0L': 'http://'}
for k, v in encoding.iteritems():
link = link.replace(k, v)
return link
def print_version(self, url):
return url + '?prtpage=1'
def preprocess_html(self, soup, *args):
byl = soup.find(attrs={'class':'byline'})
if byl is not None:
for l in byl.findAll('label'):
l.extract()
return soup

View File

@ -1,3 +1,6 @@
" Scan the following dirs (recursively) for tags
let g:project_tags_dirs = ['src/calibre']
" Include directories for C++ modules
let g:syntastic_cpp_include_dirs = [
\'/usr/include/python2.7',
@ -27,7 +30,7 @@ fun! CalibreLog()
hi def link au Keyword
syntax match au /^.*:::$/
nnoremap <silent> <buffer> n :call cursor(1+search('\V:::\$', 'n'), 0)<CR>
nnoremap <silent> <buffer> yb vt#t<Space>y
nnoremap <silent> <buffer> yb v/#<CR>t<Space>y:nohl<CR>
normal! gg2j
edit Changelog.yaml
edit src/calibre/constants.py

View File

@ -63,7 +63,7 @@ def upload_signatures():
shell=True)
shutil.rmtree(tdir)
class ReUpload(Command): # {{{
class ReUpload(Command): # {{{
description = 'Re-uplaod any installers present in dist/'
@ -118,7 +118,7 @@ def run_remote_upload(args):
# }}}
class UploadInstallers(Command): # {{{
class UploadInstallers(Command): # {{{
def add_options(self, parser):
parser.add_option('--replace', default=False, action='store_true', help=
@ -172,7 +172,7 @@ class UploadInstallers(Command): # {{{
run_remote_upload(args)
# }}}
class UploadUserManual(Command): # {{{
class UploadUserManual(Command): # {{{
description = 'Build and upload the User Manual'
sub_commands = ['manual']
@ -184,7 +184,8 @@ class UploadUserManual(Command): # {{{
with CurrentDir(path):
with ZipFile(f, 'w') as zf:
for x in os.listdir('.'):
if x.endswith('.swp'): continue
if x.endswith('.swp'):
continue
zf.write(x)
if os.path.isdir(x):
for y in os.listdir(x):
@ -203,7 +204,7 @@ class UploadUserManual(Command): # {{{
'bugs:%s'%USER_MANUAL]), shell=True)
# }}}
class UploadDemo(Command): # {{{
class UploadDemo(Command): # {{{
description = 'Rebuild and upload various demos'
@ -223,20 +224,20 @@ class UploadDemo(Command): # {{{
check_call('scp /tmp/html-demo.zip divok:%s/'%(DOWNLOADS,), shell=True)
# }}}
class UploadToServer(Command): # {{{
class UploadToServer(Command): # {{{
description = 'Upload miscellaneous data to calibre server'
def run(self, opts):
check_call('ssh divok rm -f %s/calibre-\*.tar.xz'%DOWNLOADS, shell=True)
#check_call('scp dist/calibre-*.tar.xz divok:%s/'%DOWNLOADS, shell=True)
# check_call('scp dist/calibre-*.tar.xz divok:%s/'%DOWNLOADS, shell=True)
check_call('gpg --armor --detach-sign dist/calibre-*.tar.xz',
shell=True)
check_call('scp dist/calibre-*.tar.xz.asc divok:%s/signatures/'%DOWNLOADS,
shell=True)
check_call('ssh divok /usr/local/bin/update-calibre',
shell=True)
check_call('''ssh divok echo %s \\> %s/latest_version'''\
check_call('''ssh divok echo %s \\> %s/latest_version'''
%(__version__, DOWNLOADS), shell=True)
check_call('ssh divok /etc/init.d/apache2 graceful',
shell=True)

View File

@ -408,6 +408,10 @@ def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None,
return opener
def jsbrowser(*args, **kwargs):
    '''
    Create and return a ``calibre.web.jsbrowser.browser.Browser``; all
    positional and keyword arguments are forwarded to its constructor.
    '''
    # The import is done at call time, inside the function
    from calibre.web.jsbrowser.browser import Browser
    instance = Browser(*args, **kwargs)
    return instance
def fit_image(width, height, pwidth, pheight):
'''
Fit image in box of width pwidth and height pheight.

View File

@ -4,7 +4,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
__appname__ = u'calibre'
numeric_version = (0, 9, 33)
numeric_version = (0, 9, 34)
__version__ = u'.'.join(map(unicode, numeric_version))
__author__ = u"Kovid Goyal <kovid@kovidgoyal.net>"

View File

@ -554,6 +554,7 @@ from calibre.ebooks.conversion.plugins.txt_input import TXTInput
from calibre.ebooks.conversion.plugins.lrf_input import LRFInput
from calibre.ebooks.conversion.plugins.chm_input import CHMInput
from calibre.ebooks.conversion.plugins.snb_input import SNBInput
from calibre.ebooks.conversion.plugins.docx_input import DOCXInput
from calibre.ebooks.conversion.plugins.epub_output import EPUBOutput
from calibre.ebooks.conversion.plugins.fb2_output import FB2Output
@ -595,6 +596,7 @@ plugins += [
LRFInput,
CHMInput,
SNBInput,
DOCXInput,
]
plugins += [
EPUBOutput,

View File

@ -985,11 +985,19 @@ class DB(object):
else:
if callable(getattr(data, 'read', None)):
data = data.read()
try:
save_cover_data_to(data, path)
except (IOError, OSError):
time.sleep(0.2)
save_cover_data_to(data, path)
if data is None:
if os.path.exists(path):
try:
os.remove(path)
except (IOError, OSError):
time.sleep(0.2)
os.remove(path)
else:
try:
save_cover_data_to(data, path)
except (IOError, OSError):
time.sleep(0.2)
save_cover_data_to(data, path)
def copy_format_to(self, book_id, fmt, fname, path, dest,
windows_atomic_move=None, use_hardlink=False):

View File

@ -826,7 +826,8 @@ class Cache(object):
@write_api
def set_cover(self, book_id_data_map):
''' Set the cover for this book. data can be either a QImage,
QPixmap, file object or bytestring '''
QPixmap, file object or bytestring. It can also be None, in which
case any existing cover is removed. '''
for book_id, data in book_id_data_map.iteritems():
try:
@ -836,7 +837,8 @@ class Cache(object):
path = self._field_for('path', book_id).replace('/', os.sep)
self.backend.set_cover(book_id, path, data)
self._set_field('cover', {book_id:1 for book_id in book_id_data_map})
return self._set_field('cover', {
book_id:(0 if data is None else 1) for book_id, data in book_id_data_map.iteritems()})
@write_api
def set_metadata(self, book_id, mi, ignore_errors=False, force_changes=False,

View File

@ -24,16 +24,23 @@ if __name__ == '__main__':
args = parser.parse_args()
if args.name and args.name.startswith('.'):
tests = find_tests()
q = args.name[1:]
if not q.startswith('test_'):
q = 'test_' + q
ans = None
try:
for suite in tests:
for test in suite._tests:
for s in test:
if s._testMethodName == args.name[1:]:
tests = s
if s._testMethodName == q:
ans = s
raise StopIteration()
except StopIteration:
pass
if ans is None:
print ('No test named %s found' % args.name)
raise SystemExit(1)
tests = ans
else:
tests = unittest.defaultTestLoader.loadTestsFromName(args.name) if args.name else find_tests()
unittest.TextTestRunner(verbosity=4).run(tests)

View File

@ -355,7 +355,28 @@ class WritingTest(BaseTest):
ae(opf.authors, ['author1', 'author2'])
# }}}
def test_set_cover(self):
def test_set_cover(self): # {{{
' Test setting of cover '
self.assertTrue(False, 'TODO: test set_cover() and set_metadata()')
cache = self.init_cache()
ae = self.assertEqual
# Test removing a cover
ae(cache.field_for('cover', 1), 1)
ae(cache.set_cover({1:None}), set([1]))
ae(cache.field_for('cover', 1), 0)
img = b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00`\x00`\x00\x00\xff\xe1\x00\x16Exif\x00\x00II*\x00\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xdb\x00C\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\xff\xdb\x00C\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\xff\xc0\x00\x11\x08\x00\x01\x00\x01\x03\x01"\x00\x02\x11\x01\x03\x11\x01\xff\xc4\x00\x15\x00\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\n\xff\xc4\x00\x14\x10\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xc4\x00\x14\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xc4\x00\x14\x11\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xda\x00\x0c\x03\x01\x00\x02\x11\x03\x11\x00?\x00\xbf\x80\x01\xff\xd9' # noqa {{{ }}}
# Test setting a cover
ae(cache.set_cover({bid:img for bid in (1, 2, 3)}), {1, 2, 3})
old = self.init_old()
for book_id in (1, 2, 3):
ae(cache.cover(book_id), img, 'Cover was not set correctly for book %d' % book_id)
ae(cache.field_for('cover', book_id), 1)
ae(old.cover(book_id, index_is_id=True), img, 'Cover was not set correctly for book %d' % book_id)
self.assertTrue(old.has_cover(book_id))
# }}}
def test_set_metadata(self):
' Test setting of metadata '
self.assertTrue(False, 'TODO: test set_metadata()')

View File

@ -461,7 +461,7 @@ class Writer(object):
dt = field.metadata['datatype']
self.accept_vals = lambda x: True
if dt == 'composite' or field.name in {
'id', 'cover', 'size', 'path', 'formats', 'news'}:
'id', 'size', 'path', 'formats', 'news'}:
self.set_books_func = dummy
elif self.name[0] == '#' and self.name.endswith('_index'):
self.set_books_func = custom_series_index

View File

@ -152,7 +152,8 @@ def add_simple_plugin(path_to_plugin):
shutil.rmtree(tdir)
def print_basic_debug_info(out=None):
if out is None: out = sys.stdout
if out is None:
out = sys.stdout
out = functools.partial(prints, file=out)
import platform
from calibre.constants import (__appname__, get_version, isportable, isosx,
@ -175,7 +176,7 @@ def print_basic_debug_info(out=None):
def run_debug_gui(logpath):
import time
time.sleep(3) # Give previous GUI time to shutdown fully and release locks
time.sleep(3) # Give previous GUI time to shutdown fully and release locks
from calibre.constants import __appname__
prints(__appname__, _('Debug log'))
print_basic_debug_info()
@ -197,6 +198,12 @@ def run_script(path, args):
g['__file__'] = ef
execfile(ef, g)
def inspect_mobi(path):
    '''
    Announce ``path`` and hand it to the MOBI debug inspector from
    ``calibre.ebooks.mobi.debug.main``.
    '''
    from calibre.ebooks.mobi.debug.main import inspect_mobi as run_inspector
    prints('Inspecting:', path)
    run_inspector(path)
    # NOTE(review): bare ``print`` -- as a Python 2 statement this emits an
    # empty line; confirm the module does not enable print_function.
    print
def main(args=sys.argv):
from calibre.constants import debug
debug()
@ -231,7 +238,7 @@ def main(args=sys.argv):
main()
elif opts.command:
sys.argv = args
exec opts.command
exec(opts.command)
elif opts.debug_device_driver:
debug_device_driver()
elif opts.add_simple_plugin is not None:
@ -246,11 +253,8 @@ def main(args=sys.argv):
sql_dump = args[-1]
reinit_db(opts.reinitialize_db, sql_dump=sql_dump)
elif opts.inspect_mobi:
from calibre.ebooks.mobi.debug.main import inspect_mobi
for path in args[1:]:
prints('Inspecting:', path)
inspect_mobi(path)
print
elif opts.tweak_book:
from calibre.ebooks.tweak import tweak
tweak(opts.tweak_book)
@ -274,6 +278,16 @@ def main(args=sys.argv):
plugin.cli_main([plugin.name] + args[1:])
elif len(args) >= 2 and args[1].rpartition('.')[-1] in {'py', 'recipe'}:
run_script(args[1], args[2:])
elif len(args) >= 2 and args[1].rpartition('.')[-1] in {'mobi', 'azw', 'azw3', 'docx'}:
for path in args[1:]:
ext = path.rpartition('.')[-1]
if ext == 'docx':
from calibre.ebooks.docx.dump import dump
dump(path)
elif ext in {'mobi', 'azw', 'azw3'}:
inspect_mobi(path)
else:
print ('Cannot dump unknown filetype: %s' % path)
else:
from calibre import ipython
ipython()
@ -282,3 +296,4 @@ def main(args=sys.argv):
if __name__ == '__main__':
sys.exit(main())

View File

@ -1174,10 +1174,10 @@ class libiMobileDevice():
self.plist_lib.plist_free(plist)
# To determine success, we need to inspect the returned plist
if hasattr(result, 'Status'):
if 'Status' in result:
if self.verbose:
self.log(" STATUS: %s" % result['Status'])
elif hasattr(result, 'Error'):
elif 'Error' in result:
if self.verbose:
self.log(" ERROR: %s" % result['Error'])
raise libiMobileDeviceException(result['Error'])
@ -1293,7 +1293,9 @@ class libiMobileDevice():
else:
index = 0
while devices[index]:
device_list.append(devices[index].contents.value)
# Filter out redundant entries
if devices[index].contents.value not in device_list:
device_list.append(devices[index].contents.value)
index += 1
if self.verbose:
self.log(" %s" % repr(device_list))

View File

@ -35,7 +35,7 @@ class KOBO(USBMS):
gui_name = 'Kobo Reader'
description = _('Communicate with the Kobo Reader')
author = 'Timothy Legge and David Forrester'
version = (2, 0, 11)
version = (2, 0, 12)
dbversion = 0
fwversion = 0
@ -1218,7 +1218,7 @@ class KOBOTOUCH(KOBO):
min_dbversion_images_on_sdcard = 77
min_dbversion_activiy = 77
max_supported_fwversion = (2,5,3)
max_supported_fwversion = (2,6,1)
min_fwversion_images_on_sdcard = (2,4,1)
has_kepubs = True
@ -2381,9 +2381,17 @@ class KOBOTOUCH(KOBO):
"WHERE Shelf.Name = C.ShelfName "
"AND c._IsDeleted <> 'true')")
delete_activity_query = ("DELETE FROM Activity "
"WHERE Type = 'Shelf' "
"AND NOT EXISTS "
"(SELECT 1 FROM Shelf "
"WHERE Shelf.Name = Activity.Id)"
)
cursor = connection.cursor()
cursor.execute(delete_query)
cursor.execute(update_query)
cursor.execute(delete_activity_query)
connection.commit()
cursor.close()

View File

@ -0,0 +1,22 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
class DOCXInput(InputFormatPlugin):
    # Input plugin that registers the .docx extension and delegates the actual
    # conversion to calibre.ebooks.docx.to_html.Convert.

    name = 'DOCX Input'
    author = 'Kovid Goyal'
    description = 'Convert DOCX files (.docx) to HTML'
    file_types = {'docx'}

    # Recommend page breaks before '/' at medium priority.
    recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}

    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Build a ``Convert`` object for ``stream`` (passing ``log`` through) and
        return the result of calling it.  ``options``, ``file_ext`` and
        ``accelerators`` are not used here.
        '''
        from calibre.ebooks.docx.to_html import Convert
        converter = Convert(stream, log=log)
        return converter()

View File

@ -87,7 +87,7 @@ class HTMLInput(InputFormatPlugin):
return self._is_case_sensitive
if not path or not os.path.exists(path):
return islinux or isbsd
self._is_case_sensitive = not (os.path.exists(path.lower()) \
self._is_case_sensitive = not (os.path.exists(path.lower())
and os.path.exists(path.upper()))
return self._is_case_sensitive
@ -101,6 +101,8 @@ class HTMLInput(InputFormatPlugin):
from calibre.ebooks.oeb.transforms.metadata import \
meta_info_to_oeb_metadata
from calibre.ebooks.html.input import get_filelist
from calibre.ebooks.metadata import string_to_authors
from calibre.utils.localization import canonicalize_lang
import cssutils, logging
cssutils.log.setLevel(logging.WARN)
self.OEB_STYLES = OEB_STYLES
@ -111,11 +113,20 @@ class HTMLInput(InputFormatPlugin):
metadata = oeb.metadata
meta_info_to_oeb_metadata(mi, metadata, log)
if not metadata.language:
oeb.logger.warn(u'Language not specified')
metadata.add('language', get_lang().replace('_', '-'))
l = canonicalize_lang(getattr(opts, 'language', None))
if not l:
oeb.logger.warn(u'Language not specified')
l = get_lang().replace('_', '-')
metadata.add('language', l)
if not metadata.creator:
oeb.logger.warn('Creator not specified')
metadata.add('creator', self.oeb.translate(__('Unknown')))
a = getattr(opts, 'authors', None)
if a:
a = string_to_authors(a)
if not a:
oeb.logger.warn('Creator not specified')
a = [self.oeb.translate(__('Unknown'))]
for aut in a:
metadata.add('creator', aut)
if not metadata.title:
oeb.logger.warn('Title not specified')
metadata.add('title', self.oeb.translate(__('Unknown')))
@ -175,7 +186,8 @@ class HTMLInput(InputFormatPlugin):
titles = []
headers = []
for item in self.oeb.spine:
if not item.linear: continue
if not item.linear:
continue
html = item.data
title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
title = re.sub(r'\s+', ' ', title.strip())
@ -193,7 +205,8 @@ class HTMLInput(InputFormatPlugin):
if len(titles) > len(set(titles)):
use = headers
for title, item in izip(use, self.oeb.spine):
if not item.linear: continue
if not item.linear:
continue
toc.add(title, item.href)
oeb.container = DirContainer(os.getcwdu(), oeb.log, ignore_opf=True)
@ -291,3 +304,4 @@ class HTMLInput(InputFormatPlugin):
self.log.exception('Failed to read CSS file: %r'%link)
return (None, None)
return (None, raw)

View File

@ -87,9 +87,12 @@ def read_single_border(parent, edge):
if sz is not None:
# we dont care about art borders (they are only used for page borders)
try:
width = min(96, max(2, float(sz))) / 8
# WebKit needs at least 1pt to render borders
width = min(96, max(8, float(sz))) / 8
except (ValueError, TypeError):
pass
if style == 'double' and width is not None and 0 < width < 3:
width = 3 # WebKit needs 3pts to render double borders
return {p:v for p, v in zip(border_props, (padding, width, style, color))}
def read_border(parent, dest, border_edges=('left', 'top', 'right', 'bottom'), name='pBdr'):
@ -297,7 +300,7 @@ class ParagraphStyle(object):
# Misc.
'text_indent', 'text_align', 'line_height', 'direction', 'background_color',
'numbering', 'font_family', 'font_size', 'frame',
'numbering', 'font_family', 'font_size', 'color', 'frame',
)
def __init__(self, pPr=None):
@ -321,7 +324,7 @@ class ParagraphStyle(object):
for s in XPath('./w:pStyle[@w:val]')(pPr):
self.linked_style = get(s, 'w:val')
self.font_family = self.font_size = inherit
self.font_family = self.font_size = self.color = inherit
self._css = None
@ -365,7 +368,7 @@ class ParagraphStyle(object):
if self.line_height not in {inherit, '1'}:
c['line-height'] = self.line_height
for x in ('text_indent', 'text_align', 'background_color', 'font_family', 'font_size'):
for x in ('text_indent', 'text_align', 'background_color', 'font_family', 'font_size', 'color'):
val = getattr(self, x)
if val is not inherit:
if x == 'font_size':

View File

@ -36,7 +36,8 @@ def read_text_border(parent, dest):
if sz is not None:
# we don't care about art borders (they are only used for page borders)
try:
border_width = min(96, max(2, float(sz))) / 8
# A border of less than 1pt is not rendered by WebKit
border_width = min(96, max(8, float(sz))) / 8
except (ValueError, TypeError):
pass
@ -103,7 +104,7 @@ def read_underline(parent, dest):
for col in XPath('./w:u[@w:val]')(parent):
val = get(col, 'w:val')
if val:
ans = 'underline'
ans = val if val == 'none' else 'underline'
setattr(dest, 'text_decoration', ans)
def read_vert_align(parent, dest):
@ -116,8 +117,12 @@ def read_vert_align(parent, dest):
def read_font_family(parent, dest):
ans = inherit
for col in XPath('./w:rFonts[@w:ascii]')(parent):
val = get(col, 'w:ascii')
for col in XPath('./w:rFonts')(parent):
val = get(col, 'w:asciiTheme')
if val:
val = '|%s|' % val
else:
val = get(col, 'w:ascii')
if val:
ans = val
setattr(dest, 'font_family', ans)
@ -234,16 +239,5 @@ class RunStyle(object):
return self._css
def same_border(self, other):
for x in (self, other):
has_border = False
for y in ('color', 'style', 'width'):
if ('border-%s' % y) in x.css:
has_border = True
break
if not has_border:
return False
s = tuple(self.css.get('border-%s' % y, None) for y in ('color', 'style', 'width'))
o = tuple(other.css.get('border-%s' % y, None) for y in ('color', 'style', 'width'))
return s == o
return self.get_border_css({}) == other.get_border_css({})

View File

@ -0,0 +1,136 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
def mergeable(previous, current):
    """Return True if ``current`` can be folded into ``previous``.

    Two elements are mergeable when neither has tail text, both carry the
    same ``class`` attribute, ``current`` has no ``id`` attribute and
    ``current`` is the sibling that immediately follows ``previous``.
    """
    if (
        previous.tail or current.tail or
        previous.get('class', None) != current.get('class', None) or
        current.get('id', False)
    ):
        return False
    # The first following sibling (if there is one) must be ``current`` itself
    return next(previous.itersiblings(), None) is current
def append_text(parent, text):
    """Append ``text`` at the very end of the content of ``parent``.

    If ``parent`` has children the text is added to the tail of the last
    child, otherwise it is added to the text of ``parent`` itself.
    """
    if len(parent) == 0:
        parent.text = (parent.text or '') + text
        return
    last = parent[-1]
    last.tail = (last.tail or '') + text
def merge(parent, span):
    """Move the content of ``span`` to the end of ``parent`` and drop ``span``.

    The text, the children and the tail of ``span`` are appended (in that
    order) to ``parent``; ``span`` is then removed from its own parent.
    """
    leading, trailing = span.text, span.tail
    if leading:
        append_text(parent, leading)
    for child in span:
        # Appending re-parents the child, taking it out of ``span``
        parent.append(child)
    if trailing:
        append_text(parent, trailing)
    span.getparent().remove(span)
def merge_run(run):
    """Merge every element of ``run`` after the first into the first one."""
    target, rest = run[0], run[1:]
    for span in rest:
        merge(target, span)
def liftable(css):
    """Return True if the styling in ``css`` can be moved to a parent element.

    A <span> is liftable if all its styling would work just as well if it is
    specified on the parent element. That is taken to be the case when every
    property name in ``css`` starts (up to the first ``-``) with one of
    ``text``, ``font``, ``letter``, ``color`` or ``background``. An empty
    ``css`` is liftable.

    :param css: Mapping of CSS property names to values.
    """
    # Iterate the mapping directly instead of the Python 2 only iterkeys()
    prefixes = {prop.partition('-')[0] for prop in css}
    return not (prefixes - {'text', 'font', 'letter', 'color', 'background'})
def add_text(elem, attr, text):
    """Append ``text`` to the attribute ``attr`` (e.g. ``'text'`` or
    ``'tail'``) of ``elem``, treating a missing/empty value as ''."""
    existing = getattr(elem, attr)
    if not existing:
        existing = ''
    setattr(elem, attr, existing + text)
def lift(span):
    """Replace an element by its content (text, children and tail)."""
    parent = span.getparent()
    idx = parent.index(span)
    # Remember the last child now, it is needed to place the tail of the span
    # after the children have been moved out
    last_child = span[-1] if len(span) else None

    def add_before_idx(text):
        # Attach text just before position idx in parent: either to the text
        # of parent or to the tail of the preceding sibling
        if idx == 0:
            add_text(parent, 'text', text)
        else:
            add_text(parent[idx - 1], 'tail', text)

    if span.text:
        add_before_idx(span.text)

    # Inserting in reverse order at a fixed index preserves the child order
    for child in reversed(span):
        parent.insert(idx, child)
    parent.remove(span)

    trailing = span.tail
    if trailing:
        if last_child is None:
            add_before_idx(trailing)
        else:
            add_text(last_child, 'tail', trailing)
def cleanup_markup(root, styles):
    """Simplify the generated HTML markup in place.

    The following passes are run, in order:

      1. Consecutive, mergeable <span>s are merged into a single <span>.
      2. A <span> that is the only content of a block element (p, div,
         h1-h6) and whose styling is liftable is removed, with its class
         added to the block element.
      3. <span>s whose only styling is italic or bold become <i>/<b> tags.
      4. <span>s without class and id are replaced by their content.

    :param root: Root element of the tree, must support ``xpath()``.
    :param styles: Object whose ``classes`` mapping has
        ``(class name, css dict)`` pairs as its values.
    """
    # Merge consecutive spans that have the same styling
    current_run = []
    for span in root.xpath('//span'):
        if current_run and not mergeable(current_run[-1], span):
            if len(current_run) > 1:
                merge_run(current_run)
            current_run = []
        current_run.append(span)
    # The last run is not terminated by a non-mergeable span, so it has to be
    # merged explicitly, otherwise trailing spans are never merged
    if len(current_run) > 1:
        merge_run(current_run)

    # Remove unnecessary span tags that are the only child of a parent block
    # element
    class_map = dict(styles.classes.itervalues())
    parents = ('p', 'div') + tuple('h%d' % i for i in xrange(1, 7))
    for parent in root.xpath('//*[(%s) and count(span)=1]' % ' or '.join('name()="%s"' % t for t in parents)):
        if len(parent) == 1 and not parent.text and not parent[0].tail and not parent[0].get('id', None):
            # We have a block whose contents are entirely enclosed in a <span>
            span = parent[0]
            span_class = span.get('class', None)
            span_css = class_map.get(span_class, {})
            if liftable(span_css):
                pclass = parent.get('class', None)
                if span_class:
                    pclass = (pclass + ' ' + span_class) if pclass else span_class
                    parent.set('class', pclass)
                parent.text = span.text
                parent.remove(span)
                for child in span:
                    parent.append(child)

    # Make spans whose only styling is bold or italic into <b> and <i> tags
    for span in root.xpath('//span[@class]'):
        css = class_map.get(span.get('class', None), {})
        if len(css) == 1:
            if css == {'font-style':'italic'}:
                span.tag = 'i'
                del span.attrib['class']
            elif css == {'font-weight':'bold'}:
                span.tag = 'b'
                del span.attrib['class']

    # Get rid of <span>s that have no styling
    for span in root.xpath('//span[not(@class) and not(@id)]'):
        lift(span)

View File

@ -39,7 +39,7 @@ def read_doc_props(raw, mi):
for keywords in XPath('//cp:keywords')(root):
if keywords.text and keywords.text.strip():
for x in keywords.text.split():
tags.extend(y.strip() for y in x.split(','))
tags.extend(y.strip() for y in x.split(',') if y.strip())
if tags:
mi.tags = tags
authors = XPath('//dc:creator')(root)

View File

@ -15,7 +15,7 @@ from calibre.utils.zipfile import ZipFile
def dump(path):
dest = os.path.splitext(os.path.basename(path))[0]
dest += '_extracted'
dest += '-dumped'
if os.path.exists(dest):
shutil.rmtree(dest)
with ZipFile(path) as zf:

View File

@ -104,9 +104,12 @@ class Images(object):
if rid in self.used:
return self.used[rid]
raw = self.docx.read(self.rid_map[rid])
base = base or ascii_filename(self.rid_map[rid].rpartition('/')[-1]).replace(' ', '_')
base = base or ascii_filename(self.rid_map[rid].rpartition('/')[-1]).replace(' ', '_') or 'image'
ext = what(None, raw) or base.rpartition('.')[-1] or 'jpeg'
base = base.rpartition('.')[0] + '.' + ext
base = base.rpartition('.')[0]
if not base:
base = 'image'
base += '.' + ext
exists = frozenset(self.used.itervalues())
c = 1
while base in exists:
@ -132,7 +135,7 @@ class Images(object):
src = self.generate_filename(rid, name)
img = IMG(src='images/%s' % src)
if alt:
img(alt=alt)
img.set('alt', alt)
return img
def drawing_to_html(self, drawing, page):
@ -157,6 +160,17 @@ class Images(object):
ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in style.iteritems()))
yield ans
def pict_to_html(self, pict, page):
for imagedata in XPath('descendant::v:imagedata[@r:id]')(pict):
rid = get(imagedata, 'r:id')
if rid in self.rid_map:
src = self.generate_filename(rid)
img = IMG(src='images/%s' % src, style="display:block")
alt = get(imagedata, 'o:title')
if alt:
img.set('alt', alt)
yield img
def get_float_properties(self, anchor, style, page):
if 'display' not in style:
style['display'] = 'block'
@ -200,6 +214,8 @@ class Images(object):
if elem.tag.endswith('}drawing'):
for tag in self.drawing_to_html(elem, page):
yield tag
# TODO: Handle w:pict
else:
for tag in self.pict_to_html(elem, page):
yield tag

View File

@ -7,7 +7,6 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from future_builtins import map
from lxml.etree import XPath as X
@ -23,6 +22,7 @@ IMAGES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships
LINKS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink'
FOOTNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes'
ENDNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes'
THEMES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme'
namespaces = {
'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
@ -84,11 +84,10 @@ def get(x, attr, default=None):
return x.attrib.get(expand(attr), default)
def ancestor(elem, name):
tag = expand(name)
while elem is not None:
elem = elem.getparent()
if getattr(elem, 'tag', None) == tag:
return elem
try:
return XPath('ancestor::%s[1]' % name)(elem)[0]
except IndexError:
return None
def generate_anchor(name, existing):
x = y = 'id_' + re.sub(r'[^0-9a-zA-Z_]', '', ascii_text(name)).lstrip('_')
@ -99,7 +98,7 @@ def generate_anchor(name, existing):
return y
def children(elem, *args):
return elem.iterchildren(*map(expand, args))
return XPath('|'.join('child::%s' % a for a in args))(elem)
def descendants(elem, *args):
return elem.iterdescendants(*map(expand, args))
return XPath('|'.join('descendant::%s' % a for a in args))(elem)

View File

@ -142,8 +142,8 @@ class Styles(object):
def get(self, key, default=None):
return self.id_map.get(key, default)
def __call__(self, root, fonts):
self.fonts = fonts
def __call__(self, root, fonts, theme):
self.fonts, self.theme = fonts, theme
for s in XPath('//w:style')(root):
s = Style(s)
if s.style_id:
@ -265,7 +265,8 @@ class Styles(object):
def resolve_run(self, r):
ans = self.run_cache.get(r, None)
if ans is None:
p = r.getparent()
p = XPath('ancestor::w:p[1]')(r)
p = p[0] if p else None
ans = self.run_cache[r] = RunStyle()
direct_formatting = None
for rPr in XPath('./w:rPr')(r):
@ -282,12 +283,16 @@ class Styles(object):
default_char = self.default_styles.get('character', None)
if self.default_character_style is not None:
parent_styles.append(self.default_character_style)
ts = self.tables.run_style(p)
if ts is not None:
parent_styles.append(ts)
pstyle = self.para_char_cache.get(p, None)
if pstyle is not None:
parent_styles.append(pstyle)
# As best as I can understand the spec, table overrides should be
# applied before paragraph overrides, but word does it
# this way, see the December 2007 table header in the demo
# document.
ts = self.tables.run_style(p)
if ts is not None:
parent_styles.append(ts)
if direct_formatting.linked_style is not None:
ls = self.get(direct_formatting.linked_style).character_style
if ls is not None:
@ -299,7 +304,8 @@ class Styles(object):
setattr(ans, attr, self.run_val(parent_styles, direct_formatting, attr))
if ans.font_family is not inherit:
ans.font_family = self.fonts.family_for(ans.font_family, ans.b, ans.i)
ff = self.theme.resolve_font_family(ans.font_family)
ans.font_family = self.fonts.family_for(ff, ans.b, ans.i)
return ans
@ -312,51 +318,63 @@ class Styles(object):
def cascade(self, layers):
self.body_font_family = 'serif'
self.body_font_size = '10pt'
self.body_color = 'black'
def promote_property(char_styles, block_style, prop):
vals = {getattr(s, prop) for s in char_styles}
if len(vals) == 1:
# All the character styles have the same value
for s in char_styles:
setattr(s, prop, inherit)
setattr(block_style, prop, next(iter(vals)))
for p, runs in layers.iteritems():
has_links = '1' in {r.get('is-link', None) for r in runs}
char_styles = [self.resolve_run(r) for r in runs]
block_style = self.resolve_paragraph(p)
c = Counter()
for prop in ('font_family', 'font_size', 'color'):
if has_links and prop == 'color':
# We cannot promote color as browser rendering engines will
# override the link color setting it to blue, unless the
# color is specified on the link element itself
continue
promote_property(char_styles, block_style, prop)
for s in char_styles:
if s.font_family is not inherit:
c[s.font_family] += 1
if s.text_decoration == 'none':
# The default text decoration is 'none'
s.text_decoration = inherit
def promote_most_common(block_styles, prop, default):
c = Counter()
for s in block_styles:
val = getattr(s, prop)
if val is not inherit:
c[val] += 1
val = None
if c:
family = c.most_common(1)[0][0]
block_style.font_family = family
for s in char_styles:
if s.font_family == family:
s.font_family = inherit
val = c.most_common(1)[0][0]
for s in block_styles:
oval = getattr(s, prop)
if oval is inherit:
if default != val:
setattr(s, prop, default)
elif oval == val:
setattr(s, prop, inherit)
return val
sizes = [s.font_size for s in char_styles if s.font_size is not inherit]
if sizes:
sz = block_style.font_size = sizes[0]
for s in char_styles:
if s.font_size == sz:
s.font_size = inherit
block_styles = tuple(self.resolve_paragraph(p) for p in layers)
block_styles = [self.resolve_paragraph(p) for p in layers]
c = Counter()
for s in block_styles:
if s.font_family is not inherit:
c[s.font_family] += 1
ff = promote_most_common(block_styles, 'font_family', self.body_font_family)
if ff is not None:
self.body_font_family = ff
if c:
self.body_font_family = family = c.most_common(1)[0][0]
for s in block_styles:
if s.font_family == family:
s.font_family = inherit
fs = promote_most_common(block_styles, 'font_size', int(self.body_font_size[:2]))
if fs is not None:
self.body_font_size = '%.3gpt' % fs
c = Counter()
for s in block_styles:
if s.font_size is not inherit:
c[s.font_size] += 1
if c:
sz = c.most_common(1)[0][0]
for s in block_styles:
if s.font_size == sz:
s.font_size = inherit
self.body_font_size = '%.3gpt' % sz
color = promote_most_common(block_styles, 'color', self.body_color)
if color is not None:
self.body_color = color
def resolve_numbering(self, numbering):
# When a numPr element appears inside a paragraph style, the lvl info
@ -398,9 +416,7 @@ class Styles(object):
ef = self.fonts.embed_fonts(dest_dir, docx)
prefix = textwrap.dedent(
'''\
body { font-family: %s; font-size: %s }
p { text-indent: 1.5em }
body { font-family: %s; font-size: %s; color: %s }
ul, ol, p { margin: 0; padding: 0 }
@ -416,7 +432,7 @@ class Styles(object):
dl.notes dd:last-of-type { page-break-after: avoid }
''') % (self.body_font_family, self.body_font_size)
''') % (self.body_font_family, self.body_font_size, self.body_color)
if ef:
prefix = ef + '\n' + prefix
@ -427,3 +443,4 @@ class Styles(object):
ans.append('.%s {\n%s\n}\n' % (cls, b.rstrip(';')))
return prefix + '\n' + '\n'.join(ans)

View File

@ -8,11 +8,14 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from lxml.html.builder import TABLE, TR, TD
from calibre.ebooks.docx.block_styles import inherit, read_shd, read_border, binary_property, border_props, ParagraphStyle # noqa
from calibre.ebooks.docx.block_styles import inherit, read_shd as rs, read_border, binary_property, border_props, ParagraphStyle
from calibre.ebooks.docx.char_styles import RunStyle
from calibre.ebooks.docx.names import XPath, get, is_tag
# Read from XML {{{
read_shd = rs
edges = ('left', 'top', 'right', 'bottom')
def _read_width(elem):
ans = inherit
try:
@ -44,13 +47,13 @@ def read_cell_width(parent, dest):
def read_padding(parent, dest):
name = 'tblCellMar' if parent.tag.endswith('}tblPr') else 'tcMar'
left = top = bottom = right = inherit
ans = {x:inherit for x in edges}
for mar in XPath('./w:%s' % name)(parent):
for x in ('left', 'top', 'right', 'bottom'):
for x in edges:
for edge in XPath('./w:%s' % x)(mar):
locals()[x] = _read_width(edge)
for x in ('left', 'top', 'right', 'bottom'):
setattr(dest, 'cell_padding_%s' % x, locals()[x])
ans[x] = _read_width(edge)
for x in edges:
setattr(dest, 'cell_padding_%s' % x, ans[x])
def read_justification(parent, dest):
left = right = inherit
@ -73,6 +76,12 @@ def read_spacing(parent, dest):
ans = _read_width(cs)
setattr(dest, 'spacing', ans)
def read_float(parent, dest):
ans = inherit
for x in XPath('./w:tblpPr')(parent):
ans = {k.rpartition('}')[-1]: v for k, v in x.attrib.iteritems()}
setattr(dest, 'float', ans)
def read_indent(parent, dest):
ans = inherit
for cs in XPath('./w:tblInd')(parent):
@ -139,40 +148,124 @@ def read_look(parent, dest):
# }}}
def clone(style):
ans = type(style)()
try:
ans = type(style)()
except TypeError:
return None
ans.update(style)
return ans
class RowStyle(object):
class Style(object):
def update(self, other):
for prop in self.all_properties:
nval = getattr(other, prop)
if nval is not inherit:
setattr(self, prop, nval)
def convert_spacing(self):
ans = {}
if self.spacing is not inherit:
if self.spacing in {'auto', '0'}:
ans['border-collapse'] = 'collapse'
else:
ans['border-collapse'] = 'separate'
ans['border-spacing'] = self.spacing
return ans
def convert_border(self):
c = {}
for x in edges:
for prop in border_props:
prop = prop % x
if prop.startswith('border'):
val = getattr(self, prop)
if val is not inherit:
if isinstance(val, (int, float)):
val = '%.3gpt' % val
c[prop.replace('_', '-')] = val
return c
class RowStyle(Style):
all_properties = ('height', 'cantSplit', 'hidden', 'spacing',)
def __init__(self, tcPr=None):
if tcPr is None:
for p in self.all_properties:
setattr(self, p, inherit)
else:
pass
class CellStyle(object):
all_properties = ('background_color', 'cell_padding_left', 'cell_padding_right', 'cell_padding_top',
'cell_padding_bottom', 'width', 'vertical_align', 'col_span', 'vMerge', 'hMerge',
) + tuple(k % edge for edge in border_edges for k in border_props)
def __init__(self, trPr=None):
if trPr is None:
for p in self.all_properties:
setattr(self, p, inherit)
else:
for p in ('hidden', 'cantSplit'):
setattr(self, p, binary_property(trPr, p))
for p in ('spacing', 'height'):
f = globals()['read_%s' % p]
f(trPr, self)
self._css = None
@property
def css(self):
if self._css is None:
c = self._css = {}
if self.hidden is True:
c['display'] = 'none'
if self.cantSplit is True:
c['page-break-inside'] = 'avoid'
if self.height is not inherit:
rule, val = self.height
if rule != 'auto':
try:
c['min-height' if rule == 'atLeast' else 'height'] = '%.3gpt' % (int(val)/20)
except (ValueError, TypeError):
pass
c.update(self.convert_spacing())
return self._css
class CellStyle(Style):
all_properties = ('background_color', 'cell_padding_left', 'cell_padding_right', 'cell_padding_top',
'cell_padding_bottom', 'width', 'vertical_align', 'col_span', 'vMerge', 'hMerge', 'row_span',
) + tuple(k % edge for edge in border_edges for k in border_props)
def __init__(self, tcPr=None):
if tcPr is None:
for p in self.all_properties:
setattr(self, p, inherit)
else:
for x in ('borders', 'shd', 'padding', 'cell_width', 'vertical_align', 'col_span', 'merge'):
f = globals()['read_%s' % x]
f(trPr, self)
f(tcPr, self)
self.row_span = inherit
self._css = None
class TableStyle(object):
@property
def css(self):
if self._css is None:
self._css = c = {}
if self.background_color is not inherit:
c['background-color'] = self.background_color
if self.width not in (inherit, 'auto'):
c['width'] = self.width
c['vertical-align'] = 'top' if self.vertical_align is inherit else self.vertical_align
for x in edges:
val = getattr(self, 'cell_padding_%s' % x)
if val not in (inherit, 'auto'):
c['padding-%s' % x] = val
elif val is inherit and x in {'left', 'right'}:
c['padding-%s' % x] = '%.3gpt' % (115/20)
# In Word, tables are apparently rendered with some default top and
# bottom padding irrespective of the cellMargin values. Simulate
# that here.
for x in ('top', 'bottom'):
if c.get('padding-%s' % x, '0pt') == '0pt':
c['padding-%s' % x] = '0.5ex'
c.update(self.convert_border())
return self._css
class TableStyle(Style):
all_properties = (
'width', 'cell_padding_left', 'cell_padding_right', 'cell_padding_top',
'width', 'float', 'cell_padding_left', 'cell_padding_right', 'cell_padding_top',
'cell_padding_bottom', 'margin_left', 'margin_right', 'background_color',
'spacing', 'indent', 'overrides', 'col_band_size', 'row_band_size', 'look',
) + tuple(k % edge for edge in border_edges for k in border_props)
@ -183,7 +276,7 @@ class TableStyle(object):
setattr(self, p, inherit)
else:
self.overrides = inherit
for x in ('width', 'padding', 'shd', 'justification', 'spacing', 'indent', 'borders', 'band_size', 'look'):
for x in ('width', 'float', 'padding', 'shd', 'justification', 'spacing', 'indent', 'borders', 'band_size', 'look'):
f = globals()['read_%s' % x]
f(tblPr, self)
parent = tblPr.getparent()
@ -197,17 +290,12 @@ class TableStyle(object):
for trPr in XPath('./w:trPr')(tblStylePr):
orides['row'] = RowStyle(trPr)
for tcPr in XPath('./w:tcPr')(tblStylePr):
orides['cell'] = tcPr
orides['cell'] = CellStyle(tcPr)
for pPr in XPath('./w:pPr')(tblStylePr):
orides['para'] = ParagraphStyle(pPr)
for rPr in XPath('./w:rPr')(tblStylePr):
orides['run'] = RunStyle(rPr)
def update(self, other):
for prop in self.all_properties:
nval = getattr(other, prop)
if nval is not inherit:
setattr(self, prop, nval)
self._css = None
def resolve_based_on(self, parent):
for p in self.all_properties:
@ -215,11 +303,50 @@ class TableStyle(object):
if val is inherit:
setattr(self, p, getattr(parent, p))
@property
def css(self):
if self._css is None:
c = self._css = {}
if self.width not in (inherit, 'auto'):
c['width'] = self.width
for x in ('background_color', 'margin_left', 'margin_right'):
val = getattr(self, x)
if val is not inherit:
c[x.replace('_', '-')] = val
if self.indent not in (inherit, 'auto') and self.margin_left != 'auto':
c['margin-left'] = self.indent
if self.float is not inherit:
for x in ('left', 'top', 'right', 'bottom'):
val = self.float.get('%sFromText' % x, 0)
try:
val = '%.3gpt' % (int(val) / 20)
except (ValueError, TypeError):
val = '0'
c['margin-%s' % x] = val
if 'tblpXSpec' in self.float:
c['float'] = 'right' if self.float['tblpXSpec'] in {'right', 'outside'} else 'left'
else:
page = self.page
page_width = page.width - page.margin_left - page.margin_right
try:
x = int(self.float['tblpX']) / 20
except (KeyError, ValueError, TypeError):
x = 0
c['float'] = 'left' if (x/page_width) < 0.65 else 'right'
c.update(self.convert_spacing())
if 'border-collapse' not in c:
c['border-collapse'] = 'collapse'
c.update(self.convert_border())
return self._css
class Table(object):
def __init__(self, tbl, styles, para_map):
def __init__(self, tbl, styles, para_map, is_sub_table=False):
self.tbl = tbl
self.styles = styles
self.is_sub_table = is_sub_table
# Read Table Style
style = {'table':TableStyle()}
@ -243,21 +370,33 @@ class Table(object):
style['table'].update(TableStyle(tblPr))
self.table_style, self.paragraph_style = style['table'], style.get('paragraph', None)
self.run_style = style.get('run', None)
self.overrides = self.table_style.overrides
if self.overrides is inherit:
self.overrides = {}
if 'wholeTable' in self.overrides and 'table' in self.overrides['wholeTable']:
self.table_style.update(self.overrides['wholeTable']['table'])
self.style_map = {}
self.paragraphs = []
self.cell_map = []
rows = XPath('./w:tr')(tbl)
for r, tr in enumerate(rows):
overrides = self.get_overrides(r, None, len(rows), None)
self.resolve_row_style(tr, overrides)
cells = XPath('./w:tc')(tr)
self.cell_map.append([])
for c, tc in enumerate(cells):
overrides = self.get_overrides(r, c, len(rows), len(cells))
self.resolve_cell_style(tc, overrides, r, c, len(rows), len(cells))
self.cell_map[-1].append(tc)
for p in XPath('./w:p')(tc):
para_map[p] = self
self.paragraphs.append(p)
self.resolve_para_style(p, overrides)
self.sub_tables = {x:Table(x, styles, para_map) for x in XPath('./w:tr/w:tc/w:tbl')(tbl)}
self.handle_merged_cells()
self.sub_tables = {x:Table(x, styles, para_map, is_sub_table=True) for x in XPath('./w:tr/w:tc/w:tbl')(tbl)}
def override_allowed(self, name):
'Check if the named override is allowed by the tblLook element'
@ -279,37 +418,102 @@ class Table(object):
overrides = ['wholeTable']
def divisor(m, n):
return (m - (m % n)) // n
odd_column_band = (divisor(c, self.table_style.col_band_size) % 2) == 0
overrides.append('band%dVert' % (1 if odd_column_band else 2))
odd_row_band = (divisor(r, self.table_style.row_band_size) % 2) == 0
if c is not None:
odd_column_band = (divisor(c, self.table_style.col_band_size) % 2) == 1
overrides.append('band%dVert' % (1 if odd_column_band else 2))
odd_row_band = (divisor(r, self.table_style.row_band_size) % 2) == 1
overrides.append('band%dHorz' % (1 if odd_row_band else 2))
# According to the OOXML spec columns should have higher override
# priority than rows, but Word seems to do it the other way around.
if c is not None:
if c == 0:
overrides.append('firstCol')
if c >= num_of_cols_in_row - 1:
overrides.append('lastCol')
if r == 0:
overrides.append('firstRow')
if r >= num_of_rows - 1:
overrides.append('lastRow')
if c == 0:
overrides.append('firstCol')
if c >= num_of_cols_in_row - 1:
overrides.append('lastCol')
if r == 0:
if c == 0:
overrides.append('nwCell')
if c == num_of_cols_in_row - 1:
overrides.append('neCell')
if r == num_of_rows - 1:
if c == 0:
overrides.append('swCell')
if c == num_of_cols_in_row - 1:
overrides.append('seCell')
if c is not None:
if r == 0:
if c == 0:
overrides.append('nwCell')
if c == num_of_cols_in_row - 1:
overrides.append('neCell')
if r == num_of_rows - 1:
if c == 0:
overrides.append('swCell')
if c == num_of_cols_in_row - 1:
overrides.append('seCell')
return tuple(filter(self.override_allowed, overrides))
def resolve_row_style(self, tr, overrides):
rs = RowStyle()
for o in overrides:
if o in self.overrides:
ovr = self.overrides[o]
ors = ovr.get('row', None)
if ors is not None:
rs.update(ors)
for trPr in XPath('./w:trPr')(tr):
rs.update(RowStyle(trPr))
self.style_map[tr] = rs
def resolve_cell_style(self, tc, overrides, row, col, rows, cols_in_row):
cs = CellStyle()
# from lxml.etree import tostring
# txt = tostring(tc, method='text', encoding=unicode)
for o in overrides:
if o in self.overrides:
ovr = self.overrides[o]
ors = ovr.get('cell', None)
if ors is not None:
cs.update(ors)
for tcPr in XPath('./w:tcPr')(tc):
cs.update(CellStyle(tcPr))
for x in edges:
p = 'cell_padding_%s' % x
val = getattr(cs, p)
if val is inherit:
setattr(cs, p, getattr(self.table_style, p))
is_inside_edge = (
(x == 'left' and col > 0) or
(x == 'top' and row > 0) or
(x == 'right' and col < cols_in_row - 1) or
(x == 'bottom' and row < rows -1)
)
inside_edge = ('insideH' if x in {'top', 'bottom'} else 'insideV') if is_inside_edge else None
for prop in border_props:
if not prop.startswith('border'):
continue
eprop = prop % x
iprop = (prop % inside_edge) if inside_edge else None
val = getattr(cs, eprop)
if val is inherit and iprop is not None:
# Use the insideX borders if the main cell borders are not
# specified
val = getattr(cs, iprop)
if val is inherit:
val = getattr(self.table_style, iprop)
if not is_inside_edge and val == 'none':
# Cell borders must override table borders even when the
# table border is not null and the cell border is null.
val = 'hidden'
setattr(cs, eprop, val)
self.style_map[tc] = cs
def resolve_para_style(self, p, overrides):
text_styles = [None if self.paragraph_style is None else clone(self.paragraph_style),
None if self.run_style is None else clone(self.run_style)]
text_styles = [clone(self.paragraph_style), clone(self.run_style)]
for o in overrides:
if o in self.table_style.overrides:
ovr = self.table_style.overrides[o]
if o in self.overrides:
ovr = self.overrides[o]
for i, name in enumerate(('para', 'run')):
ops = ovr.get(name, None)
if ops is not None:
@ -319,6 +523,55 @@ class Table(object):
text_styles[i].update(ops)
self.style_map[p] = text_styles
def handle_merged_cells(self):
if not self.cell_map:
return
# Handle vMerge
max_col_num = max(len(r) for r in self.cell_map)
for c in xrange(max_col_num):
cells = [row[c] if c < len(row) else None for row in self.cell_map]
runs = [[]]
for cell in cells:
try:
s = self.style_map[cell]
except KeyError: # cell is None
s = CellStyle()
if s.vMerge == 'restart':
runs.append([cell])
elif s.vMerge == 'continue':
runs[-1].append(cell)
else:
runs.append([])
for run in runs:
if len(run) > 1:
self.style_map[run[0]].row_span = len(run)
for tc in run[1:]:
tc.getparent().remove(tc)
# Handle hMerge
for cells in self.cell_map:
runs = [[]]
for cell in cells:
try:
s = self.style_map[cell]
except KeyError: # cell is None
s = CellStyle()
if s.col_span is not inherit:
runs.append([])
continue
if s.hMerge == 'restart':
runs.append([cell])
elif s.hMerge == 'continue':
runs[-1].append(cell)
else:
runs.append([])
for run in runs:
if len(run) > 1:
self.style_map[run[0]].col_span = len(run)
for tc in run[1:]:
tc.getparent().remove(tc)
def __iter__(self):
for p in self.paragraphs:
yield p
@ -326,8 +579,10 @@ class Table(object):
for p in t:
yield p
def apply_markup(self, rmap, parent=None):
def apply_markup(self, rmap, page, parent=None):
table = TABLE('\n\t\t')
self.table_style.page = page
style_map = {}
if parent is None:
try:
first_para = rmap[next(iter(self))]
@ -340,36 +595,53 @@ class Table(object):
parent.append(table)
for row in XPath('./w:tr')(self.tbl):
tr = TR('\n\t\t\t')
style_map[tr] = self.style_map[row]
tr.tail = '\n\t\t'
table.append(tr)
for tc in XPath('./w:tc')(row):
td = TD()
style_map[td] = s = self.style_map[tc]
if s.col_span is not inherit:
td.set('colspan', type('')(s.col_span))
if s.row_span is not inherit:
td.set('rowspan', type('')(s.row_span))
td.tail = '\n\t\t\t'
tr.append(td)
for x in XPath('./w:p|./w:tbl')(tc):
if x.tag.endswith('}p'):
td.append(rmap[x])
else:
self.sub_tables[x].apply_markup(rmap, parent=td)
self.sub_tables[x].apply_markup(rmap, page, parent=td)
if len(tr):
tr[-1].tail = '\n\t\t'
if len(table):
table[-1].tail = '\n\t'
table_style = self.table_style.css
if table_style:
table.set('class', self.styles.register(table_style, 'table'))
for elem, style in style_map.iteritems():
css = style.css
if css:
elem.set('class', self.styles.register(css, elem.tag))
class Tables(object):
def __init__(self):
self.tables = []
self.para_map = {}
self.sub_tables = set()
def register(self, tbl, styles):
if tbl in self.sub_tables:
return
self.tables.append(Table(tbl, styles, self.para_map))
self.sub_tables |= set(self.tables[-1].sub_tables)
def apply_markup(self, object_map):
def apply_markup(self, object_map, page_map):
rmap = {v:k for k, v in object_map.iteritems()}
for table in self.tables:
table.apply_markup(rmap)
table.apply_markup(rmap, page_map[table.tbl])
def para_style(self, p):
table = self.para_map.get(p, None)

View File

@ -0,0 +1,31 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.ebooks.docx.names import XPath
class Theme(object):
    """The major and minor latin fonts of a document theme.

    Defaults to Cambria (major) and Calibri (minor) until a theme is read by
    calling the instance with the root of the parsed theme XML.
    """

    def __init__(self):
        self.major_latin_font = 'Cambria'
        self.minor_latin_font = 'Calibri'

    def __call__(self, root):
        """Read the major/minor latin typefaces from the font schemes found
        under ``root``. When there are several matches, the last one wins."""
        targets = (
            ('majorFont', 'major_latin_font'),
            ('minorFont', 'minor_latin_font'),
        )
        for scheme in XPath('//a:fontScheme')(root):
            for tag, attr in targets:
                for group in XPath('./a:%s' % tag)(scheme):
                    for latin in XPath('./a:latin[@typeface]')(group):
                        setattr(self, attr, latin.get('typeface'))

    def resolve_font_family(self, ff):
        """Map a theme font reference to an actual font family.

        Theme references are marked by a leading ``|`` (and have their first
        and last characters stripped before being inspected). References
        starting with ``major`` resolve to the major latin font, all others
        to the minor latin font. Any other value is returned unchanged.
        """
        if not ff.startswith('|'):
            return ff
        if ff[1:-1].startswith('major'):
            return self.major_latin_font
        return self.minor_latin_font

View File

@ -16,13 +16,15 @@ from lxml.html.builder import (
from calibre.ebooks.docx.container import DOCX, fromstring
from calibre.ebooks.docx.names import (
XPath, is_tag, XML, STYLES, NUMBERING, FONTS, get, generate_anchor,
descendants, ancestor, FOOTNOTES, ENDNOTES)
descendants, FOOTNOTES, ENDNOTES, children, THEMES)
from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
from calibre.ebooks.docx.numbering import Numbering
from calibre.ebooks.docx.fonts import Fonts
from calibre.ebooks.docx.images import Images
from calibre.ebooks.docx.tables import Tables
from calibre.ebooks.docx.footnotes import Footnotes
from calibre.ebooks.docx.cleanup import cleanup_markup
from calibre.ebooks.docx.theme import Theme
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.oeb.polish.toc import elem_to_toc_text
@ -41,11 +43,14 @@ class Convert(object):
def __init__(self, path_or_stream, dest_dir=None, log=None, notes_text=None):
self.docx = DOCX(path_or_stream, log=log)
self.ms_pat = re.compile(r'\s{2,}')
self.ws_pat = re.compile(r'[\n\r\t]')
self.log = self.docx.log
self.notes_text = notes_text or _('Notes')
self.dest_dir = dest_dir or os.getcwdu()
self.mi = self.docx.metadata
self.body = BODY()
self.theme = Theme()
self.tables = Tables()
self.styles = Styles(self.tables)
self.images = Images()
@ -82,11 +87,13 @@ class Convert(object):
self.anchor_map = {}
self.link_map = defaultdict(list)
self.log.debug('Converting Word markup to HTML')
self.read_page_properties(doc)
for wp, page_properties in self.page_map.iteritems():
self.current_page = page_properties
p = self.convert_p(wp)
self.body.append(p)
if wp.tag.endswith('}p'):
p = self.convert_p(wp)
self.body.append(p)
notes_header = None
if self.footnotes.has_notes:
@ -103,6 +110,7 @@ class Convert(object):
for wp in note:
if wp.tag.endswith('}tbl'):
self.tables.register(wp, self.styles)
self.page_map[wp] = self.current_page
p = self.convert_p(wp)
dl[-1].append(p)
@ -110,7 +118,7 @@ class Convert(object):
self.styles.cascade(self.layers)
self.tables.apply_markup(self.object_map)
self.tables.apply_markup(self.object_map, self.page_map)
numbered = []
for html_obj, obj in self.object_map.iteritems():
@ -131,6 +139,7 @@ class Convert(object):
child.tail = '\n\t'
self.body[-1].tail = '\n'
self.log.debug('Converting styles to CSS')
self.styles.generate_classes()
for html_obj, obj in self.object_map.iteritems():
style = self.styles.resolve(obj)
@ -146,13 +155,16 @@ class Convert(object):
html_obj.set('class', cls)
if notes_header is not None:
for h in self.body.iterchildren('h1', 'h2', 'h3'):
for h in children(self.body, 'h1', 'h2', 'h3'):
notes_header.tag = h.tag
cls = h.get('class', None)
if cls and cls != 'notes-header':
notes_header.set('class', '%s notes-header' % cls)
break
self.log.debug('Cleaning up redundant markup generated by Word')
cleanup_markup(self.html, self.styles)
return self.write()
def read_page_properties(self, doc):
@ -162,6 +174,7 @@ class Convert(object):
for p in descendants(doc, 'w:p', 'w:tbl'):
if p.tag.endswith('}tbl'):
self.tables.register(p, self.styles)
current.append(p)
continue
sect = tuple(descendants(p, 'w:sectPr'))
if sect:
@ -192,6 +205,7 @@ class Convert(object):
nname = get_name(NUMBERING, 'numbering.xml')
sname = get_name(STYLES, 'styles.xml')
fname = get_name(FONTS, 'fontTable.xml')
tname = get_name(THEMES, 'theme1.xml')
foname = get_name(FOOTNOTES, 'footnotes.xml')
enname = get_name(ENDNOTES, 'endnotes.xml')
numbering = self.numbering = Numbering()
@ -220,13 +234,21 @@ class Convert(object):
else:
fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir)
if tname is not None:
    # Load the theme part, if the container has one. When it cannot be
    # read, self.theme keeps the defaults it was constructed with.
    try:
        raw = self.docx.read(tname)
    except KeyError:
        # Report the theme part that is actually missing (the message
        # previously named the styles part).
        self.log.warn('Theme %s does not exist' % tname)
    else:
        self.theme(fromstring(raw))
if sname is not None:
try:
raw = self.docx.read(sname)
except KeyError:
self.log.warn('Styles %s do not exist' % sname)
else:
self.styles(fromstring(raw), fonts)
self.styles(fromstring(raw), fonts, self.theme)
if nname is not None:
try:
@ -259,7 +281,7 @@ class Convert(object):
elem.set('id', ans)
return ans
for item in root.iterdescendants(*headings):
for item in descendants(root, *headings):
lvl = plvl = item_level_map.get(item, None)
if lvl is None:
continue
@ -305,6 +327,7 @@ class Convert(object):
current_anchor = None
current_hyperlink = None
hl_xpath = XPath('ancestor::w:hyperlink[1]')
for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
if x.tag.endswith('}r'):
@ -313,10 +336,11 @@ class Convert(object):
(dest if len(dest) == 0 else span).set('id', current_anchor)
current_anchor = None
if current_hyperlink is not None:
hl = ancestor(x, 'w:hyperlink')
if hl is not None:
try:
hl = hl_xpath(x)[0]
self.link_map[hl].append(span)
else:
x.set('is-link', '1')
except IndexError:
current_hyperlink = None
dest.append(span)
self.layers[p].append(x)
@ -359,6 +383,10 @@ class Convert(object):
wrapper = self.wrap_elems(spans, SPAN())
wrapper.set('class', cls)
if not dest.text and len(dest) == 0:
# Empty paragraph: add a non-breaking space so that it is rendered
# by WebKit
dest.text = '\xa0'
return dest
def wrap_elems(self, elems, wrapper):
@ -406,8 +434,15 @@ class Convert(object):
if not child.text:
continue
space = child.get(XML('space'), None)
preserve = False
if space == 'preserve':
text.add_elem(SPAN(child.text, style="whitespace:pre-wrap"))
# Only use a <span> with white-space:pre-wrap if this element
# actually needs it, i.e. if it has more than one
# consecutive space or it has newlines or tabs.
multi_spaces = self.ms_pat.search(child.text) is not None
preserve = multi_spaces or self.ws_pat.search(child.text) is not None
if preserve:
text.add_elem(SPAN(child.text, style="white-space:pre-wrap"))
ans.append(text.elem)
else:
text.buf.append(child.text)
@ -415,7 +450,7 @@ class Convert(object):
text.add_elem(BR())
ans.append(text.elem)
elif is_tag(child, 'w:br'):
typ = child.get('type', None)
typ = get(child, 'w:type')
if typ in {'column', 'page'}:
br = BR(style='page-break-after:always')
else:
@ -437,6 +472,8 @@ class Convert(object):
l.set('class', 'noteref')
text.add_elem(l)
ans.append(text.elem)
elif is_tag(child, 'w:fldChar') and get(child, 'w:fldCharType') == 'separate':
text.buf.append('\xa0')
if text.buf:
setattr(text.elem, text.attr, ''.join(text.buf))

View File

@ -27,7 +27,7 @@ def get_metadata(stream):
width, height, fmt = identify_data(raw)
except:
continue
if 0.8 <= height/width <= 1.8 and height*width >= 12000:
if 0.8 <= height/width <= 1.8 and height*width >= 160000:
cdata = (fmt, raw)
if cdata is not None:
mi.cover_data = cdata

View File

@ -489,7 +489,7 @@ class MobiMLizer(object):
if elem.text:
if istate.preserve:
text = elem.text
elif (len(elem) > 0 and isspace(elem.text) and elem[0].tag and
elif (len(elem) > 0 and isspace(elem.text) and hasattr(elem[0].tag, 'rpartition') and
elem[0].tag.rpartition('}')[-1] not in INLINE_TAGS):
text = None
else:

View File

@ -36,7 +36,8 @@ class Header(OrderedDict):
for line in self.DEFINITION.splitlines():
line = line.strip()
if not line or line.startswith('#'): continue
if not line or line.startswith('#'):
continue
name, val = [x.strip() for x in line.partition('=')[0::2]]
if val:
val = eval(val, {'zeroes':zeroes, 'NULL':NULL, 'DYN':None,
@ -66,7 +67,7 @@ class Header(OrderedDict):
if val is None:
raise ValueError('Dynamic field %r not set'%name)
if isinstance(val, (int, long)):
fmt = 'H' if name in self.SHORT_FIELDS else 'I'
fmt = b'H' if name in self.SHORT_FIELDS else b'I'
val = pack(b'>'+fmt, val)
buf.write(val)
@ -79,8 +80,8 @@ class Header(OrderedDict):
ans = align_block(ans)
return ans
def format_value(self, name, val):
return val

View File

@ -125,7 +125,7 @@ class EbookIterator(BookmarksMixin):
[i for i in self.opf.spine if not i.is_linear]
self.spine = []
Spiny = partial(SpineItem, read_anchor_map=read_anchor_map,
run_char_count=run_char_count)
run_char_count=run_char_count, from_epub=self.book_format == 'EPUB')
is_comic = plumber.input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'}
for i in ordered:
spath = i.path

View File

@ -36,14 +36,30 @@ def anchor_map(html):
class SpineItem(unicode):
def __new__(cls, path, mime_type=None, read_anchor_map=True,
run_char_count=True):
run_char_count=True, from_epub=False):
ppath = path.partition('#')[0]
if not os.path.exists(path) and os.path.exists(ppath):
path = ppath
obj = super(SpineItem, cls).__new__(cls, path)
with open(path, 'rb') as f:
raw = f.read()
raw, obj.encoding = xml_to_unicode(raw)
if from_epub:
# According to the spec, HTML in EPUB must be encoded in utf-8 or
# utf-16. Furthermore, there exist epub files produced by the usual
# incompetents that have utf-8 encoded HTML files that contain
# incorrect encoding declarations. See
# http://www.idpf.org/epub/20/spec/OPS_2.0.1_draft.htm#Section1.4.1.2
# http://www.idpf.org/epub/30/spec/epub30-publications.html#confreq-xml-enc
# https://bugs.launchpad.net/bugs/1188843
# So we first decode with utf-8 and only if that fails we try xml_to_unicode. This
# is the same algorithm as that used by the conversion pipeline (modulo
# some BOM based detection). Sigh.
try:
raw, obj.encoding = raw.decode('utf-8'), 'utf-8'
except UnicodeDecodeError:
raw, obj.encoding = xml_to_unicode(raw)
else:
raw, obj.encoding = xml_to_unicode(raw)
obj.character_count = character_count(raw) if run_char_count else 10000
obj.anchor_map = anchor_map(raw) if read_anchor_map else {}
obj.start_page = -1
@ -100,22 +116,24 @@ class IndexEntry(object):
self.end_anchor = None
def create_indexing_data(spine, toc):
if not toc: return
if not toc:
return
f = partial(IndexEntry, spine)
index_entries = list(map(f,
(t for t in toc.flat() if t is not toc),
(i-1 for i, t in enumerate(toc.flat()) if t is not toc)
))
index_entries.sort(key=attrgetter('sort_key'))
[ i.find_end(index_entries) for i in index_entries ]
[i.find_end(index_entries) for i in index_entries]
ie = namedtuple('IndexEntry', 'entry start_anchor end_anchor')
for spine_pos, spine_item in enumerate(spine):
for i in index_entries:
if i.end_spine_pos < spine_pos or i.spine_pos > spine_pos:
continue # Does not touch this file
continue # Does not touch this file
start = i.anchor if i.spine_pos == spine_pos else None
end = i.end_anchor if i.spine_pos == spine_pos else None
spine_item.index_entries.append(ie(i, start, end))

View File

@ -353,12 +353,19 @@ class FlowSplitter(object):
nix_element(elem)
# Tree 2
ancestors = frozenset(XPath('ancestor::*')(split_point2))
for elem in tuple(body2.iterdescendants()):
if elem is split_point2:
if not before:
nix_element(elem)
break
nix_element(elem, top=False)
if elem in ancestors:
# We have to preserve the ancestors as they could have CSS
# styles that are inherited/applicable, like font or
# width. So we only remove the text, if any.
elem.text = '\n'
else:
nix_element(elem, top=False)
body2.text = '\n'

View File

@ -27,7 +27,13 @@ class ProfileModel(QAbstractListModel):
if role == Qt.DisplayRole:
return QVariant(profile.name)
if role in (Qt.ToolTipRole, Qt.StatusTipRole, Qt.WhatsThisRole):
return QVariant(profile.description)
w, h = profile.screen_size
if w >= 10000:
ss = _('unlimited')
else:
ss = _('%d x %d pixels') % (w, h)
ss = _('Screen size: %s') % ss
return QVariant('%s [%s]' % (profile.description, ss))
return NONE
class PageSetupWidget(Widget, Ui_Form):

View File

@ -212,7 +212,7 @@ class StatusBar(QStatusBar): # {{{
if self.library_total != self.total:
base = _('{0}, {1} total').format(base, self.library_total)
self.defmsg.setText('%s [%s]' % (msg, base))
self.defmsg.setText(u'%s\xa0\xa0\xa0\xa0[%s]' % (msg, base))
self.clearMessage()
def device_disconnected(self):

View File

@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
Job management.
'''
import re
import re, time
from Queue import Empty, Queue
from PyQt4.Qt import (QAbstractTableModel, QVariant, QModelIndex, Qt,
@ -29,7 +29,7 @@ from calibre.gui2.threaded_jobs import ThreadedJobServer, ThreadedJob
from calibre.utils.search_query_parser import SearchQueryParser, ParseException
from calibre.utils.icu import lower
class JobManager(QAbstractTableModel, SearchQueryParser): # {{{
class JobManager(QAbstractTableModel, SearchQueryParser): # {{{
job_added = pyqtSignal(int)
job_done = pyqtSignal(int)
@ -55,7 +55,7 @@ class JobManager(QAbstractTableModel, SearchQueryParser): # {{{
self.timer.start(1000)
def columnCount(self, parent=QModelIndex()):
return 4
return 5
def rowCount(self, parent=QModelIndex()):
return len(self.jobs)
@ -64,11 +64,13 @@ class JobManager(QAbstractTableModel, SearchQueryParser): # {{{
if role != Qt.DisplayRole:
return NONE
if orientation == Qt.Horizontal:
if section == 0: text = _('Job')
elif section == 1: text = _('Status')
elif section == 2: text = _('Progress')
elif section == 3: text = _('Running time')
return QVariant(text)
return QVariant({
0: _('Job'),
1: _('Status'),
2: _('Progress'),
3: _('Running time'),
4: _('Start time'),
}.get(section, ''))
else:
return QVariant(section+1)
@ -117,6 +119,8 @@ class JobManager(QAbstractTableModel, SearchQueryParser): # {{{
if rtime is None:
return NONE
return QVariant('%dm %ds'%(int(rtime)//60, int(rtime)%60))
if col == 4 and job.start_time is not None:
return QVariant(time.strftime('%H:%M -- %d %b', time.localtime(job.start_time)))
if role == Qt.DecorationRole and col == 0:
state = job.run_state
if state == job.WAITING:
@ -220,7 +224,7 @@ class JobManager(QAbstractTableModel, SearchQueryParser): # {{{
def has_device_jobs(self, queued_also=False):
for job in self.jobs:
if isinstance(job, DeviceJob):
if job.duration is None: # Running or waiting
if job.duration is None: # Running or waiting
if (job.is_running or queued_also):
return True
return False
@ -341,7 +345,7 @@ class JobManager(QAbstractTableModel, SearchQueryParser): # {{{
# }}}
class FilterModel(QSortFilterProxyModel): # {{{
class FilterModel(QSortFilterProxyModel): # {{{
search_done = pyqtSignal(object)
@ -376,7 +380,7 @@ class FilterModel(QSortFilterProxyModel): # {{{
# Jobs UI {{{
class ProgressBarDelegate(QAbstractItemDelegate): # {{{
class ProgressBarDelegate(QAbstractItemDelegate): # {{{
def sizeHint(self, option, index):
return QSize(120, 30)
@ -395,7 +399,7 @@ class ProgressBarDelegate(QAbstractItemDelegate): # {{{
QApplication.style().drawControl(QStyle.CE_ProgressBar, opts, painter)
# }}}
class DetailView(QDialog, Ui_Dialog): # {{{
class DetailView(QDialog, Ui_Dialog): # {{{
def __init__(self, parent, job):
QDialog.__init__(self, parent)
@ -432,7 +436,7 @@ class DetailView(QDialog, Ui_Dialog): # {{{
self.log.appendPlainText(more.decode('utf-8', 'replace'))
# }}}
class JobsButton(QFrame): # {{{
class JobsButton(QFrame): # {{{
def __init__(self, horizontal=False, size=48, parent=None):
QFrame.__init__(self, parent)
@ -471,7 +475,6 @@ class JobsButton(QFrame): # {{{
job_manager.job_done.connect(self.job_done)
self.jobs_dialog.addAction(self.action_toggle)
def mouseReleaseEvent(self, event):
self.toggle()
@ -554,7 +557,7 @@ class JobsDialog(QDialog, Ui_JobsDialog):
try:
geom = gprefs.get('jobs_dialog_geometry', bytearray(''))
self.restoreGeometry(QByteArray(geom))
state = gprefs.get('jobs view column layout', bytearray(''))
state = gprefs.get('jobs view column layout2', bytearray(''))
self.jobs_view.horizontalHeader().restoreState(QByteArray(state))
except:
pass
@ -566,7 +569,7 @@ class JobsDialog(QDialog, Ui_JobsDialog):
def save_state(self):
try:
state = bytearray(self.jobs_view.horizontalHeader().saveState())
gprefs['jobs view column layout'] = state
gprefs['jobs view column layout2'] = state
geom = bytearray(self.saveGeometry())
gprefs['jobs_dialog_geometry'] = geom
except:
@ -640,8 +643,13 @@ class JobsDialog(QDialog, Ui_JobsDialog):
self.save_state()
return QDialog.hide(self, *args)
# Persist the dialog state via save_state() before handing the rejection over
# to the QDialog implementation, so the state is saved when the dialog is
# dismissed through reject() as well.
def reject(self):
self.save_state()
QDialog.reject(self)
# Forward the search *query* to the proxy model's find(); nothing is returned.
def find(self, query):
self.proxy_model.find(query)
# }}}

View File

@ -22,7 +22,7 @@ from calibre.customize.ui import preferences_plugins
ICON_SIZE = 32
class StatusBar(QStatusBar): # {{{
class StatusBar(QStatusBar): # {{{
def __init__(self, parent=None):
QStatusBar.__init__(self, parent)
@ -39,7 +39,7 @@ class StatusBar(QStatusBar): # {{{
# }}}
class BarTitle(QWidget): # {{{
class BarTitle(QWidget): # {{{
def __init__(self, parent=None):
QWidget.__init__(self, parent)
@ -67,7 +67,7 @@ class BarTitle(QWidget): # {{{
# }}}
class Category(QWidget): # {{{
class Category(QWidget): # {{{
plugin_activated = pyqtSignal(object)
@ -112,7 +112,7 @@ class Category(QWidget): # {{{
# }}}
class Browser(QScrollArea): # {{{
class Browser(QScrollArea): # {{{
show_plugin = pyqtSignal(object)
@ -221,6 +221,7 @@ class Preferences(QMainWindow):
self.stack.addWidget(self.scroll_area)
self.scroll_area.setWidgetResizable(True)
self.setContextMenuPolicy(Qt.NoContextMenu)
self.bar = QToolBar(self)
self.addToolBar(self.bar)
self.bar.setVisible(False)
@ -304,7 +305,6 @@ class Preferences(QMainWindow):
self.bar.setVisible(True)
self.bb.setVisible(False)
def hide_plugin(self):
self.showing_widget = QWidget(self.scroll_area)
self.scroll_area.setWidget(self.showing_widget)
@ -355,7 +355,6 @@ class Preferences(QMainWindow):
if do_restart:
self.gui.quit(restart=True)
def cancel(self, *args):
if self.close_after_initial:
self.close()
@ -389,3 +388,4 @@ if __name__ == '__main__':
p.show()
app.exec_()
gui.shutdown()

View File

@ -26,6 +26,9 @@ from calibre.utils.filenames import ascii_filename
class SearchDialog(QDialog, Ui_Dialog):
SEARCH_TEXT = _('&Search')
STOP_TEXT = _('&Stop')
def __init__(self, gui, parent=None, query=''):
QDialog.__init__(self, parent)
self.setupUi(self)
@ -89,7 +92,7 @@ class SearchDialog(QDialog, Ui_Dialog):
self.configure.setIcon(QIcon(I('config.png')))
self.adv_search_button.clicked.connect(self.build_adv_search)
self.search.clicked.connect(self.do_search)
self.search.clicked.connect(self.toggle_search)
self.checker.timeout.connect(self.get_results)
self.progress_checker.timeout.connect(self.check_progress)
self.results_view.activated.connect(self.result_item_activated)
@ -101,6 +104,7 @@ class SearchDialog(QDialog, Ui_Dialog):
self.select_none_stores.clicked.connect(self.stores_select_none)
self.configure.clicked.connect(self.do_config)
self.finished.connect(self.dialog_closed)
self.searching = False
self.progress_checker.start(100)
@ -161,6 +165,18 @@ class SearchDialog(QDialog, Ui_Dialog):
# Affiliate
self.results_view.setColumnWidth(6, 20)
def toggle_search(self):
    """Start a search, or stop the one currently in progress.

    When no search is running this simply delegates to ``do_search()``.
    Otherwise the search pool and the results model's details and cover
    pools are aborted, the button label is reset to ``SEARCH_TEXT``, the
    results checker timer is stopped and ``searching`` is cleared.
    """
    if not self.searching:
        self.do_search()
        return

    # A search is in progress: cancel all outstanding work.
    self.search_pool.abort()
    model = self.results_view.model()
    model.details_pool.abort()
    model.cover_pool.abort()

    self.search.setText(self.SEARCH_TEXT)
    self.checker.stop()
    self.searching = False
def do_search(self):
# Stop all running threads.
self.checker.stop()
@ -182,6 +198,8 @@ class SearchDialog(QDialog, Ui_Dialog):
_('You must enter a title, author or keyword to'
' search for.'), show=True)
return
self.searching = True
self.search.setText(self.STOP_TEXT)
# Give the query to the results model so it can do
# further filtering.
self.results_view.model().set_query(query)
@ -198,7 +216,7 @@ class SearchDialog(QDialog, Ui_Dialog):
query = self.clean_query(query)
shuffle(store_names)
# Add plugins that the user has checked to the search pool's work queue.
self.gui.istores.join(4.0) # Wait for updated plugins to load
self.gui.istores.join(4.0) # Wait for updated plugins to load
for n in store_names:
if self.store_checks[n].isChecked():
self.search_pool.add_task(query, n, self.gui.istores[n], self.max_results, self.timeout)
@ -387,9 +405,15 @@ class SearchDialog(QDialog, Ui_Dialog):
self.gui.istores[result.store_name].open(self, result.detail_item, self.open_external.isChecked())
def check_progress(self):
if not self.search_pool.threads_running() and not self.results_view.model().cover_pool.threads_running() and not self.results_view.model().details_pool.threads_running():
m = self.results_view.model()
if not self.search_pool.threads_running() and not m.cover_pool.threads_running() and not m.details_pool.threads_running():
self.pi.stopAnimation()
self.search.setText(self.SEARCH_TEXT)
self.searching = False
else:
self.searching = True
if unicode(self.search.text()) != self.STOP_TEXT:
self.search.setText(self.STOP_TEXT)
if not self.pi.isAnimated():
self.pi.startAnimation()
@ -427,3 +451,4 @@ if __name__ == '__main__':
s = SearchDialog(gui, query=' '.join(sys.argv[1:]))
s.exec_()

View File

@ -15,6 +15,7 @@ from PyQt4.Qt import (QFont, QVariant, QDialog, Qt, QColor, QColorDialog,
from calibre.constants import iswindows, isxp
from calibre.utils.config import Config, StringConfig, JSONConfig
from calibre.gui2 import min_available_height
from calibre.gui2.shortcuts import ShortcutConfig
from calibre.gui2.viewer.config_ui import Ui_Dialog
from calibre.utils.localization import get_language
@ -140,6 +141,7 @@ class ConfigDialog(QDialog, Ui_Dialog):
self.init_load_themes()
self.clear_search_history_button.clicked.connect(self.clear_search_history)
self.resize(self.width(), min(self.height(), max(575, min_available_height()-25)))
def clear_search_history(self):
from calibre.gui2 import config

View File

@ -29,7 +29,7 @@ from calibre.ebooks.oeb.display.webview import load_html
from calibre.constants import isxp, iswindows
# }}}
class Document(QWebPage): # {{{
class Document(QWebPage): # {{{
page_turn = pyqtSignal(object)
mark_element = pyqtSignal(QWebElement)
@ -356,7 +356,8 @@ class Document(QWebPage): # {{{
self.mainFrame().setScrollPosition(QPoint(x, y))
def jump_to_anchor(self, anchor):
if not self.loaded_javascript: return
if not self.loaded_javascript:
return
self.javascript('window.paged_display.jump_to_anchor("%s")'%anchor)
def element_ypos(self, elem):
@ -447,7 +448,7 @@ class Document(QWebPage): # {{{
@property
def width(self):
return self.mainFrame().contentsSize().width() # offsetWidth gives inaccurate results
return self.mainFrame().contentsSize().width() # offsetWidth gives inaccurate results
def set_bottom_padding(self, amount):
s = QSize(-1, -1) if amount == 0 else QSize(self.viewportSize().width(),
@ -460,7 +461,7 @@ class Document(QWebPage): # {{{
# }}}
class DocumentView(QWebView): # {{{
class DocumentView(QWebView): # {{{
magnification_changed = pyqtSignal(object)
DISABLED_BRUSH = QBrush(Qt.lightGray, Qt.Dense5Pattern)
@ -766,8 +767,10 @@ class DocumentView(QWebView): # {{{
@dynamic_property
def current_language(self):
def fget(self): return self.document.current_language
def fset(self, val): self.document.current_language = val
def fget(self):
return self.document.current_language
def fset(self, val):
self.document.current_language = val
return property(fget=fget, fset=fset)
def search(self, text, backwards=False):
@ -816,7 +819,6 @@ class DocumentView(QWebView): # {{{
self.scrollbar.blockSignals(False)
self._ignore_scrollbar_signals = False
def load_finished(self, ok):
if self.loading_url is None:
# An <iframe> finished loading
@ -960,8 +962,8 @@ class DocumentView(QWebView): # {{{
window_height = self.document.window_height
document_height = self.document.height
ddelta = document_height - window_height
#print '\nWindow height:', window_height
#print 'Document height:', self.document.height
# print '\nWindow height:', window_height
# print 'Document height:', self.document.height
delta_y = window_height - 25
if self.document.at_bottom or ddelta <= 0:
@ -974,19 +976,19 @@ class DocumentView(QWebView): # {{{
return
else:
oopos = self.document.ypos
#print 'Original position:', oopos
# print 'Original position:', oopos
self.document.set_bottom_padding(0)
opos = self.document.ypos
#print 'After set padding=0:', self.document.ypos
# print 'After set padding=0:', self.document.ypos
if opos < oopos:
if self.manager is not None:
if epf:
self.flipper.initialize(self.current_page_image())
self.manager.next_document()
return
#oheight = self.document.height
lower_limit = opos + delta_y # Max value of top y co-ord after scrolling
max_y = self.document.height - window_height # The maximum possible top y co-ord
# oheight = self.document.height
lower_limit = opos + delta_y # Max value of top y co-ord after scrolling
max_y = self.document.height - window_height # The maximum possible top y co-ord
if max_y < lower_limit:
padding = lower_limit - max_y
if padding == window_height:
@ -995,28 +997,28 @@ class DocumentView(QWebView): # {{{
self.flipper.initialize(self.current_page_image())
self.manager.next_document()
return
#print 'Setting padding to:', lower_limit - max_y
# print 'Setting padding to:', lower_limit - max_y
self.document.set_bottom_padding(lower_limit - max_y)
if epf:
self.flipper.initialize(self.current_page_image())
#print 'Document height:', self.document.height
#print 'Height change:', (self.document.height - oheight)
# print 'Document height:', self.document.height
# print 'Height change:', (self.document.height - oheight)
max_y = self.document.height - window_height
lower_limit = min(max_y, lower_limit)
#print 'Scroll to:', lower_limit
# print 'Scroll to:', lower_limit
if lower_limit > opos:
self.document.scroll_to(self.document.xpos, lower_limit)
actually_scrolled = self.document.ypos - opos
#print 'After scroll pos:', self.document.ypos
#print 'Scrolled by:', self.document.ypos - opos
# print 'After scroll pos:', self.document.ypos
# print 'Scrolled by:', self.document.ypos - opos
self.find_next_blank_line(window_height - actually_scrolled)
#print 'After blank line pos:', self.document.ypos
# print 'After blank line pos:', self.document.ypos
if epf:
self.flipper(self.current_page_image(),
duration=self.document.page_flip_duration)
if self.manager is not None:
self.manager.scrolled(self.scroll_fraction)
#print 'After all:', self.document.ypos
# print 'After all:', self.document.ypos
def page_turn_requested(self, backwards):
if backwards:
@ -1110,7 +1112,8 @@ class DocumentView(QWebView): # {{{
return
if self.document.in_paged_mode:
if abs(event.delta()) < 15: return
if abs(event.delta()) < 15:
return
typ = 'screen' if self.document.wheel_flips_pages else 'col'
direction = 'next' if event.delta() < 0 else 'previous'
loc = self.document.javascript('paged_display.%s_%s_location()'%(
@ -1134,7 +1137,7 @@ class DocumentView(QWebView): # {{{
event.accept()
return
if self.document.at_bottom:
self.scroll_by(y=15) # at_bottom can lie on windows
self.scroll_by(y=15) # at_bottom can lie on windows
if self.manager is not None:
self.manager.next_document()
event.accept()
@ -1218,6 +1221,12 @@ class DocumentView(QWebView): # {{{
self.paged_col_scroll()
else:
self.scroll_by(x=15)
elif key == 'Back':
if self.manager is not None:
self.manager.back(None)
elif key == 'Forward':
if self.manager is not None:
self.manager.forward(None)
else:
handled = False
return handled
@ -1256,3 +1265,4 @@ class DocumentView(QWebView): # {{{
# }}}

View File

@ -44,4 +44,10 @@ SHORTCUTS = {
'Right' : (['L', 'Right'],
_('Scroll right')),
'Back': (['Alt+Left'],
_('Back')),
'Forward': (['Alt+Right'],
_('Forward')),
}

View File

@ -706,6 +706,8 @@ class CatalogBuilder(object):
if last_c in exceptions.keys():
last_c = exceptions[unicode(last_c)]
last_ordnum = ordnum
else:
last_c = cl_list[idx-1]
cl_list[idx] = last_c
if self.DEBUG and self.opts.verbose:

View File

@ -7,7 +7,7 @@ import sys, os, cPickle, textwrap, stat
from subprocess import check_call
from functools import partial
from calibre import __appname__, prints, guess_type
from calibre import __appname__, prints, guess_type
from calibre.constants import islinux, isnetbsd, isbsd
from calibre.customize.ui import all_input_formats
from calibre.ptempfile import TemporaryDirectory
@ -15,7 +15,7 @@ from calibre import CurrentDir
entry_points = {
'console_scripts': [ \
'console_scripts': [
'ebook-device = calibre.devices.cli:main',
'ebook-meta = calibre.ebooks.metadata.cli:main',
'ebook-convert = calibre.ebooks.conversion.cli:main',
@ -123,7 +123,7 @@ os.remove(os.path.abspath(__file__))
# }}}
class ZshCompleter(object): # {{{
class ZshCompleter(object): # {{{
def __init__(self, opts):
self.opts = opts
@ -196,7 +196,8 @@ class ZshCompleter(object): # {{{
def opts_and_exts(self, name, op, exts, cover_opts=('--cover',),
opf_opts=('--opf',), file_map={}):
if not self.dest: return
if not self.dest:
return
exts = set(exts).union(x.upper() for x in exts)
pats = ('*.%s'%x for x in exts)
extra = ("'*:filename:_files -g \"%s\"' "%' '.join(pats),)
@ -206,7 +207,8 @@ class ZshCompleter(object): # {{{
self.commands[name] = txt
def opts_and_words(self, name, op, words, takes_files=False):
if not self.dest: return
if not self.dest:
return
extra = ("'*:filename:_files' ",) if takes_files else ()
opts = '\\\n '.join(tuple(self.get_options(op())) + extra)
txt = '_arguments -s \\\n ' + opts
@ -273,7 +275,8 @@ class ZshCompleter(object): # {{{
):
for fmt in fmts:
is_input = group_title == input_group
if is_input and fmt in {'rar', 'zip', 'oebzip'}: continue
if is_input and fmt in {'rar', 'zip', 'oebzip'}:
continue
p = (get_parser(input_fmt=fmt) if is_input
else get_parser(output_fmt=fmt))
opts = None
@ -282,7 +285,8 @@ class ZshCompleter(object): # {{{
opts = [o for o in group.option_list if
'--pretty-print' not in o._long_opts and
'--input-encoding' not in o._long_opts]
if not opts: continue
if not opts:
continue
opts = '\\\n '.join(tuple(self.get_options(opts)))
w('\n%s() {'%(func%fmt))
w('\n _arguments -s \\\n ' + opts)
@ -407,7 +411,6 @@ class PostInstall:
self.warnings.append((args, kwargs))
sys.stdout.flush()
def __init__(self, opts, info=prints, warn=None, manifest=None):
self.opts = opts
self.info = info
@ -482,8 +485,7 @@ class PostInstall:
raise
self.task_failed('Creating uninstaller failed')
def setup_completion(self): # {{{
def setup_completion(self): # {{{
try:
self.info('Setting up command-line completion...')
from calibre.ebooks.metadata.cli import option_parser as metaop, filetypes as meta_filetypes
@ -542,7 +544,7 @@ class PostInstall:
o_and_w('fetch-ebook-metadata', fem_op, [])
o_and_w('calibre-smtp', smtp_op, [])
o_and_w('calibre-server', serv_op, [])
o_and_e('calibre-debug', debug_op, ['py', 'recipe'], file_map={
o_and_e('calibre-debug', debug_op, ['py', 'recipe', 'mobi', 'azw', 'azw3', 'docx'], file_map={
'--tweak-book':['epub', 'azw3', 'mobi'],
'--subset-font':['ttf', 'otf'],
'--exec-file':['py', 'recipe'],
@ -636,7 +638,7 @@ class PostInstall:
self.task_failed('Setting up completion failed')
# }}}
def setup_desktop_integration(self): # {{{
def setup_desktop_integration(self): # {{{
try:
self.info('Setting up desktop integration...')
@ -745,7 +747,7 @@ def opts_and_words(name, op, words, takes_files=False):
opts = '|'.join(options(op))
words = '|'.join([w.replace("'", "\\'") for w in words])
fname = name.replace('-', '_')
return ('_'+fname+'()'+\
return ('_'+fname+'()'+
'''
{
local cur opts
@ -922,3 +924,4 @@ def main():
if __name__ == '__main__':
sys.exit(main())

View File

@ -0,0 +1,408 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import os, re
from io import BytesIO
from functools import partial
from calibre import force_unicode, walk
from calibre.constants import __appname__
from calibre.web.feeds import feeds_from_index
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.fetch.javascript import fetch_page, AbortFetch, links_from_selectors
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
def image_data_to_url(data, base='cover'):
    """Wrap raw image data in a named, in-memory file object.

    Despite the function's name, the return value is a ``BytesIO`` and not
    a URL string. The object's ``name`` attribute is set to
    ``<base>.<ext>``, where the extension comes from sniffing *data*.

    :param data: the raw bytes of the image (or PDF)
    :param base: file name stem for the returned object's ``name``;
        defaults to ``'cover'``
    :return: a ``BytesIO`` over *data* with a ``name`` attribute
    """
    from calibre.utils.imghdr import what
    ext = what(None, data)
    if not ext:
        # Type not recognised: check for a PDF header, else assume JPEG.
        ext = 'pdf' if data.startswith(b'%PDF-') else 'jpg'
    ans = BytesIO(data)
    # Honour the caller-supplied stem; previously *base* was ignored and
    # the name was always 'cover.<ext>'.
    ans.name = '%s.%s' % (base, ext)
    return ans
class JavascriptRecipe(BasicNewsRecipe):
'''
This recipe class is used to download content from javascript heavy
sites. It uses a full WebKit browser to do the downloading, therefore it
can support sites that use javascript to dynamically fetch content.
Most of the parameters from :class:`BasicNewsRecipe` still apply, apart
from those noted specifically below. The biggest difference is that you use
CSS selectors to specify tags to keep and remove as well as links to
follow, instead of the BeautifulSoup selectors used in
:class:`BasicNewsRecipe`. Indeed, BeautifulSoup has been completely removed
and replaced by lxml; wherever you previously expected BeautifulSoup to
represent parsed HTML, you will now get lxml trees. See
http://lxml.de/tutorial.html for a tutorial on using lxml.
The various article pre-processing callbacks such as ``preprocess_html()``
and ``skip_ad_pages()`` have all been replaced by just two callbacks,
:meth:`preprocess_stage1` and :meth:`preprocess_stage2`. These methods are
passed the browser instance, and can thus do anything they like.
An important method that you will often have to implement is
:meth:`load_complete` to tell the download system when a page has finished
loading and is ready to be scraped.
You can use the builtin recipe for time.com as an example of the usage of
this class.
'''
#: Minimum calibre version needed to use this recipe
requires_version = (0, 9, 35)
#: List of tags to be removed. Specified tags are removed from downloaded HTML.
#: A tag is specified using CSS selectors.
#: A common example::
#:
#: remove_tags = ['div.advert', 'div.tools']
#:
#: This will remove all `<div class="advert">` and `<div class="tools">` tags and all
#: their children from the downloaded :term:`HTML`.
remove_tags = ()
#: Remove all tags that occur after the specified tag.
#: A tag is specified using CSS selectors.
#: For example::
#:
#: remove_tags_after = '#content'
#:
#: will remove all tags after the first element with `id="content"`.
remove_tags_after = None
#: Remove all tags that occur before the specified tag.
#: A tag is specified using CSS selectors.
#: For example::
#:
#: remove_tags_before = '#content'
#:
#: will remove all tags before the first element with `id="content"`.
remove_tags_before = None
#: Keep only the specified tags and their children.
#: Uses the CSS selector syntax.
#: If this list is not empty, then the `<body>` tag will be emptied and re-filled with
#: the tags that match the entries in this list. For example::
#:
#: keep_only_tags = ['#content', '#heading']
#:
#: will keep only tags that have an `id` attribute of `"content"` or `"heading"`.
keep_only_tags = ()
#: A list of selectors that match <a href> elements that you want followed.
#: For this to work you must also set recursions to at least 1.
#: You can get more control by re-implementing :meth:`select_links` in your sub-class.
links_from_selectors = ()
def select_links(self, browser, url, recursion_level):
'''
Override this method in your recipe to implement arbitrary link following logic. It must return a
list of URLs, each of which will be downloaded in turn.
'''
return links_from_selectors(self.links_from_selectors, self.recursions, browser, url, recursion_level)
def get_jsbrowser(self, *args, **kwargs):
'''
Override this method in your recipe if you want to use a non-standard Browser object.
'''
from calibre.web.jsbrowser.browser import Browser
return Browser(default_timeout=kwargs.get('default_timeout', self.timeout))
def do_login(self, browser, username, password):
'''
This method is used to login to a website that uses a paywall. Implement it in
your recipe if the site uses a paywall. An example implementation::
def do_login(self, browser, username, password):
browser.visit('http://some-page-that-has-a-login')
form = browser.select_form(nr=0) # Select the first form on the page
form['username'] = username
form['password'] = password
browser.submit(timeout=120) # Submit the form and wait at most two minutes for loading to complete
Note that you can also select forms with CSS2 selectors, like this::
browser.select_form('form#login_form')
browser.select_form('form[name="someform"]')
'''
pass
def get_publication_data(self, browser):
'''
Download the cover, the masthead image and the list of sections/articles.
Should return a dictionary with keys 'index', 'cover' and 'masthead'.
'cover' and 'masthead' are optional, if not present, they will be auto-generated.
The index must be in the same format as described in :meth:`parse_index`.
'''
raise NotImplementedError('You must implement this method in your recipe')
def load_complete(self, browser, url, recursion_level):
'''
This method is called after every page on the website is loaded. To be
precise, it is called when the DOM is ready. If further checks need to
be made, they should be made here. For example, if you want to check
that some element in the DOM is present, you would use::
def load_complete(self, browser, url, rl):
browser.wait_for_element('#article-footer')
return True
where article-footer is the id of the element you want to wait for.
'''
return True
def abort_article(self, msg=None):
'''
Call this method in any article processing callback to abort the download of the article.
For example::
def postprocess_html(self, article, root, url, recursion_level):
if '/video/' in url:
self.abort_article()
return root
This will cause this article to be ignored.
'''
raise AbortFetch(msg or 'Article fetch aborted')
def preprocess_stage1(self, article, browser, url, recursion_level):
'''
This method is a callback called for every downloaded page, before any cleanup is done.
'''
pass
def preprocess_stage2(self, article, browser, url, recursion_level):
'''
This method is a callback called for every downloaded page, after the cleanup is done.
'''
pass
def postprocess_html(self, article, root, url, recursion_level):
'''
This method is called with the downloaded html for every page as an lxml
tree. It is called after all cleanup and related processing is completed.
You can use it to perform any extra cleanup, or to abort the article
download (see :meth:`abort_article`).
:param article: The Article object, which represents the article being currently downloaded
:param root: The parsed downloaded HTML, as an lxml tree, see http://lxml.de/tutorial.html
for help with using lxml to manipulate HTML.
:param url: The URL from which this HTML was downloaded
:param recursion_level: This is zero for the first page in an article and > 0 for subsequent pages.
'''
return root
def index_to_soup(self, url_or_raw, raw=False):
    '''
    Convenience method that turns an index page into a parsed lxml tree.
    See http://lxml.de/tutorial.html for help with using lxml.

    `url_or_raw`: Either a URL (anything that starts with ``scheme://``),
    which is loaded via ``self.jsbrowser``, or the markup of the index page.

    `raw`: If True, the markup (after encoding declarations have been
    stripped) is returned instead of a parsed tree.
    '''
    looks_like_url = re.match(r'\w+://', url_or_raw) is not None
    if looks_like_url:
        # Load the page in the javascript browser and read back its markup
        self.jsbrowser.start_load(url_or_raw)
        markup = self.jsbrowser.html
    else:
        markup = url_or_raw
    if isinstance(markup, bytes):
        markup = xml_to_unicode(markup)[0]
    markup = strip_encoding_declarations(markup)
    if raw:
        return markup
    import html5lib
    tree = html5lib.parse(markup, treebuilder='lxml', namespaceHTMLElements=False)
    return tree.getroot()
# ***************************** Internal API *****************************
def _preprocess_browser(self, article, browser, url, stage, recursion_level):
func = getattr(self, 'preprocess_stage%d' % stage)
return func(article, browser, url, recursion_level)
def _postprocess_html(self, article, feed_num, art_num, feed_len, root, url, recursion_level):
    '''
    Internal cleanup applied to the parsed HTML of every downloaded page,
    before and after calling the user visible :meth:`postprocess_html`.

    :param article: The Article object being downloaded
    :param feed_num: Index of the feed the article belongs to
    :param art_num: Index of the article inside its feed
    :param feed_len: Number of articles in the feed
    :param root: The page as an lxml tree
    :param url: The URL the page was downloaded from
    :param recursion_level: Zero for the first page of an article, > 0 for linked pages
    :return: The processed tree, or None if :meth:`postprocess_html` returned None
    '''
    from lxml.html.builder import STYLE

    if self.no_stylesheets:
        for link in root.xpath('//link[@href]'):
            # A missing/empty type attribute is treated as text/css. Only
            # stylesheet links are removed (the previous test was always
            # true, so the type check had no effect).
            link_type = (link.get('type', '') or 'text/css').strip().lower()
            if link_type == 'text/css':
                link.getparent().remove(link)

        for style in root.xpath('//style'):
            style.getparent().remove(style)

    # Add recipe specific styling
    head = root.xpath('//head|//body')
    head = head[0] if head else next(root.iterdescendants())
    head.append(STYLE(self.template_css + '\n\n' + (self.extra_css or '') + '\n'))

    # Add the top navbar
    if recursion_level == 0:
        body = root.xpath('//body')
        if body:
            templ = self.navbar.generate(
                False, feed_num, art_num, feed_len, not self.has_single_feed, url,
                __appname__, center=self.center_navbar,
                extra_css=self.extra_css)
            body[0].insert(0, templ.root.xpath('//div')[0])

    # Remove javascript
    remove_attrs = set(self.remove_attributes)
    if self.remove_javascript:
        remove_attrs.add('onload')
        for script in root.xpath('//*[name()="script" or name()="noscript"]'):
            script.getparent().remove(script)

    # Remove specified attributes
    for attr in remove_attrs:
        for tag in root.xpath('//*[@%s]' % attr):
            tag.attrib.pop(attr, None)

    # Remove tags that cause problems on ebook devices
    nuke = ['base', 'iframe', 'canvas', 'embed', 'command', 'datalist', 'video', 'audio', 'form']
    for tag in root.xpath('|'.join('//%s' % tag for tag in nuke)):
        tag.getparent().remove(tag)

    # Hand over to the recipe; a return value of None is passed straight
    # back to the caller
    root = self.postprocess_html(article, root, url, recursion_level)

    if root is not None:
        # Nuke HTML5 tags
        tags = ['article', 'aside', 'header', 'footer', 'nav', 'figcaption', 'figure', 'section']
        for tag in root.xpath('|'.join('//%s' % tag for tag in tags)):
            tag.tag = 'div'

        self.populate_article_metadata(article, root, recursion_level == 0)

    return root
def download(self):
browser = self.jsbrowser = self.get_jsbrowser()
with browser:
try:
if self.needs_subscription and self.username and self.password:
self.do_login(browser, self.username, self.password)
data = self.get_publication_data(browser)
# Process cover, if any
cdata = data.get('cover', None)
if cdata:
self.cover_url = image_data_to_url(cdata)
self.download_cover()
# Process masthead, if any
mdata = data.get('masthead', None)
if mdata:
self.masthead_url = image_data_to_url(mdata)
self.resolve_masthead()
# Process the list of sections/articles
return self.build_index(data, browser)
finally:
self.cleanup()
def build_index(self, data, browser):
sections = data.get('index', None)
if not sections:
raise ValueError('No articles found, aborting')
feeds = feeds_from_index(sections, oldest_article=self.oldest_article,
max_articles_per_feed=self.max_articles_per_feed,
log=self.log)
if not feeds:
raise ValueError('No articles found, aborting')
if self.ignore_duplicate_articles is not None:
feeds = self.remove_duplicate_articles(feeds)
if self.test:
feeds = feeds[:2]
self.has_single_feed = len(feeds) == 1
index = os.path.join(self.output_dir, 'index.html')
html = self.feeds2index(feeds)
with open(index, 'wb') as fi:
fi.write(html)
if self.reverse_article_order:
for feed in feeds:
if hasattr(feed, 'reverse'):
feed.reverse()
self.report_progress(0, _('Got feeds from index page'))
resource_cache = {}
total = 0
for feed in feeds:
total += min(self.max_articles_per_feed, len(feed))
num = 0
for f, feed in enumerate(feeds):
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
if not os.path.isdir(feed_dir):
os.makedirs(feed_dir)
for a, article in enumerate(feed):
if a >= self.max_articles_per_feed:
break
num += 1
art_dir = os.path.join(feed_dir, 'article_%d'%a)
if not os.path.isdir(art_dir):
os.makedirs(art_dir)
try:
url = self.print_version(article.url)
except NotImplementedError:
url = article.url
except:
self.log.exception('Failed to find print version for: '+article.url)
url = None
if not url:
continue
self.log.debug('Downloading article:', article.title, 'from', url)
try:
pages = fetch_page(
url,
load_complete=self.load_complete,
links=self.select_links,
remove=self.remove_tags,
keep_only=self.keep_only_tags,
preprocess_browser=partial(self._preprocess_browser, article),
postprocess_html=partial(self._postprocess_html, article, f, a, len(feed)),
remove_before=self.remove_tags_before,
remove_after=self.remove_tags_after,
remove_javascript=self.remove_javascript,
delay=self.delay,
resource_cache=resource_cache, output_dir=art_dir, browser=browser)
except AbortFetch:
self.log.exception('Fetching of article: %r aborted' % article.title)
continue
except Exception:
self.log.exception('Fetching of article: %r failed' % article.title)
continue
self.log.debug('Downloaded article:', article.title, 'from', article.url)
article.orig_url = article.url
article.url = 'article_%d/index.html'%a
article.downloaded = True
article.sub_pages = pages[1:]
self.report_progress(float(num)/total,
_(u'Article downloaded: %s')%force_unicode(article.title))
for f, feed in enumerate(feeds):
html = self.feed2index(f, feeds)
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
fi.write(html)
if self.no_stylesheets:
for f in walk(self.output_dir):
if f.endswith('.css'):
os.remove(f)
self.create_opf(feeds)
self.report_progress(1, _('Download finished'))
return index

View File

@ -1160,26 +1160,7 @@ class BasicNewsRecipe(Recipe):
self.report_progress(0, _('Trying to download cover...'))
self.download_cover()
self.report_progress(0, _('Generating masthead...'))
self.masthead_path = None
try:
murl = self.get_masthead_url()
except:
self.log.exception('Failed to get masthead url')
murl = None
if murl is not None:
# Try downloading the user-supplied masthead_url
# Failure sets self.masthead_path to None
self.download_masthead(murl)
if self.masthead_path is None:
self.log.info("Synthesizing mastheadImage")
self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
try:
self.default_masthead_image(self.masthead_path)
except:
self.log.exception('Failed to generate default masthead image')
self.masthead_path = None
self.resolve_masthead()
if self.test:
feeds = feeds[:2]
@ -1268,7 +1249,10 @@ class BasicNewsRecipe(Recipe):
if not cu:
return
cdata = None
if os.access(cu, os.R_OK):
if hasattr(cu, 'read'):
cdata = cu.read()
cu = getattr(cu, 'name', 'cover.jpg')
elif os.access(cu, os.R_OK):
cdata = open(cu, 'rb').read()
else:
self.report_progress(1, _('Downloading cover from %s')%cu)
@ -1305,13 +1289,19 @@ class BasicNewsRecipe(Recipe):
self.cover_path = None
def _download_masthead(self, mu):
ext = mu.rpartition('.')[-1]
if '?' in ext:
ext = ''
if hasattr(mu, 'rpartition'):
ext = mu.rpartition('.')[-1]
if '?' in ext:
ext = ''
else:
ext = mu.name.rpartition('.')[-1]
ext = ext.lower() if ext else 'jpg'
mpath = os.path.join(self.output_dir, 'masthead_source.'+ext)
outfile = os.path.join(self.output_dir, 'mastheadImage.jpg')
if os.access(mu, os.R_OK):
if hasattr(mu, 'read'):
with open(mpath, 'wb') as mfile:
mfile.write(mu.read())
elif os.access(mu, os.R_OK):
with open(mpath, 'wb') as mfile:
mfile.write(open(mu, 'rb').read())
else:
@ -1329,6 +1319,27 @@ class BasicNewsRecipe(Recipe):
except:
self.log.exception("Failed to download supplied masthead_url")
def resolve_masthead(self):
self.masthead_path = None
try:
murl = self.get_masthead_url()
except:
self.log.exception('Failed to get masthead url')
murl = None
if murl is not None:
# Try downloading the user-supplied masthead_url
# Failure sets self.masthead_path to None
self.download_masthead(murl)
if self.masthead_path is None:
self.log.info("Synthesizing mastheadImage")
self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
try:
self.default_masthead_image(self.masthead_path)
except:
self.log.exception('Failed to generate default masthead image')
self.masthead_path = None
def default_cover(self, cover_file):
'''
Create a generic cover for recipes that dont have a cover

View File

@ -7,11 +7,12 @@ Builtin recipes.
import re, time, io
from calibre.web.feeds.news import (BasicNewsRecipe, CustomIndexRecipe,
AutomaticNewsRecipe, CalibrePeriodical)
from calibre.web.feeds.jsnews import JavascriptRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.utils.config import JSONConfig
basic_recipes = (BasicNewsRecipe, AutomaticNewsRecipe, CustomIndexRecipe,
CalibrePeriodical)
CalibrePeriodical, JavascriptRecipe)
custom_recipes = JSONConfig('custom_recipes/index.json')

View File

@ -0,0 +1,262 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import time, os, hashlib
from operator import attrgetter
from collections import defaultdict
from functools import partial
from calibre import jsbrowser
from calibre.ebooks.chardet import strip_encoding_declarations
from calibre.utils.imghdr import what
from calibre.web.jsbrowser.browser import Timeout
# remove_comments() {{{
remove_comments = '''
function remove_comments(node) {
var nodes = node.childNodes, i=0, t;
while((t = nodes.item(i++))) {
switch(t.nodeType){
case Node.ELEMENT_NODE:
remove_comments(t);
break;
case Node.COMMENT_NODE:
node.removeChild(t);
i--;
}
}
}
remove_comments(document)
''' # }}}
class AbortFetch(ValueError):
pass
def children(elem):
    ''' Yield the direct children of elem, in document order, stopping at
    the first null element. '''
    child = elem.firstChild()
    while True:
        if child.isNull():
            return
        yield child
        child = child.nextSibling()
def apply_keep_only(browser, keep_only):
    '''
    Reduce the contents of <body> to the elements matched by the CSS
    selectors in keep_only. Logs an error and leaves the document untouched
    if there is no body or nothing matches.
    '''
    frame = browser.page.mainFrame()
    body = frame.findFirstElement('body')
    if body.isNull():
        browser.log.error('Document has no body, cannot apply keep_only')
        return
    keep = [x for selector in keep_only for x in frame.findAllElements(selector)]
    if not keep:
        browser.log.error('Failed to find any elements matching the keep_only selectors: %r' % keep_only)
        return
    # Append the matched elements to body
    for elem in keep:
        body.appendInside(elem)
    # Remove children of body until the first kept element is reached
    for elem in tuple(children(body)):
        if any(x == elem for x in keep):
            break
        elem.removeFromDocument()
def apply_remove(browser, remove):
    ''' Remove from the document every element matched by any of the CSS
    selectors in remove. '''
    frame = browser.page.mainFrame()
    for selector in remove:
        for match in frame.findAllElements(selector):
            if match.isNull():
                continue
            match.removeFromDocument()
def remove_beyond(browser, selector, before=True):
    '''
    Remove everything that comes before (``before=True``) or after
    (``before=False``) the first element matching selector.

    Starting at the matched element, the preceding/following siblings are
    removed, then the same is done for its parent, and so on up the tree.
    The walk stops when the body element is reached, so that siblings of
    body are left alone.

    :param browser: The browser whose main frame is to be modified
    :param selector: CSS selector, only the first match is used
    :param before: Which side of the matched element to remove
    '''
    mf = browser.page.mainFrame()
    elem = mf.findFirstElement(selector)
    if elem.isNull():
        browser.log('Failed to find any element matching the selector: %s' % selector)
        return
    next_sibling = attrgetter('previousSibling' if before else 'nextSibling')

    # Compare the tag name case-insensitively (as is_tag() does): the case
    # reported for HTML tag names cannot be relied on, and a case-sensitive
    # test against 'body' lets the walk continue past body.
    while not elem.isNull() and unicode(elem.tagName()).lower() != 'body':
        # Collect first, then remove, so that the sibling chain is not
        # modified while it is being walked
        remove = []
        after = next_sibling(elem)()
        while not after.isNull():
            remove.append(after)
            after = next_sibling(after)()
        for x in remove:
            x.removeFromDocument()
        elem = elem.parent()
def is_tag(elem, name):
    ''' Return True if the tag name of elem equals name, ignoring case. '''
    tag_name = unicode(elem.tagName())
    return tag_name.lower() == name.lower()
def download_resources(browser, resource_cache, output_dir):
    '''
    Save the images and stylesheets used by the current page into output_dir
    and rewrite the src/href attributes of the elements that use them.

    resource_cache maps the SHA1 digest of the resource data to the path it
    was saved at; data that is already in the cache is linked to instead of
    being written again. Elements whose resource could not be loaded are
    removed from the document, as are <link> elements that are not
    text/css stylesheets.
    '''
    def js_string(elem, expr):
        # Using javascript ensures that absolute URLs are returned, direct
        # attribute access does not do that
        return unicode(elem.evaluateJavaScript(expr).toString()).strip()

    num_images = num_styles = 0
    by_url = defaultdict(list)

    for img in browser.css_select('img[src]', all=True):
        src = js_string(img, 'this.src')
        if src:
            by_url[src].append(img)

    for link in browser.css_select('link[href]', all=True):
        link_type = unicode(link.attribute('type')).strip() or 'text/css'
        rel = unicode(link.attribute('rel')).strip() or 'stylesheet'
        href = ''
        if link_type == 'text/css' and rel == 'stylesheet':
            href = js_string(link, 'this.href')
        if href:
            by_url[href].append(link)
        else:
            link.removeFromDocument()

    fetched = browser.wait_for_resources(by_url)

    for url, raw in fetched.iteritems():
        digest = hashlib.sha1(raw).digest()
        users = by_url[url]
        if digest in resource_cache:
            # Same data was saved earlier, link to the existing file
            href = os.path.relpath(resource_cache[digest], output_dir).replace(os.sep, '/')
        else:
            if is_tag(users[0], 'link'):
                num_styles += 1
                href = 'style_%d.css' % num_styles
            else:
                num_images += 1
                ext = what(None, raw) or 'jpg'
                href = 'img_%d.%s' % (num_images, ext)
            dest = os.path.join(output_dir, href)
            resource_cache[digest] = dest
            with open(dest, 'wb') as f:
                f.write(raw)
        for elem in users:
            elem.setAttribute('href' if is_tag(elem, 'link') else 'src', href)

    for url in set(by_url) - set(fetched):
        browser.log.warn('Failed to download resource:', url)
        for elem in by_url[url]:
            elem.removeFromDocument()
def save_html(browser, output_dir, postprocess_html, url, recursion_level):
    '''
    Parse the current markup of the browser, run it through the
    postprocess_html callback and write the result to index.html in
    output_dir. Returns the path of the written file. Raises AbortFetch if
    the callback returns None.
    '''
    markup = strip_encoding_declarations(browser.html)
    import html5lib
    tree = html5lib.parse(markup, treebuilder='lxml', namespaceHTMLElements=False)
    root = postprocess_html(tree.getroot(), url, recursion_level)
    if root is None:
        # user wants this page to be aborted
        raise AbortFetch('%s was aborted during postprocess' % url)
    dest = os.path.join(output_dir, 'index.html')
    with open(dest, 'wb') as out:
        from lxml.html import tostring
        serialized = tostring(root, include_meta_content_type=True, encoding='utf-8', pretty_print=True)
        out.write(serialized)
    return out.name
def links_from_selectors(selectors, recursions, browser, url, recursion_level):
    '''
    Return the list of (non-empty) href values of all elements in the
    current page that match any of the CSS selectors. An empty list is
    returned once recursion_level has reached recursions.
    '''
    if recursions <= recursion_level:
        return []
    found = []
    for selector in selectors:
        for anchor in browser.css_select(selector, all=True):
            target = unicode(anchor.evaluateJavaScript('this.href').toString()).strip()
            if target:
                found.append(target)
    return found
def clean_dom(
    browser, url, recursion_level, preprocess_browser, remove_javascript,
    keep_only, remove_after, remove_before, remove):
    '''
    Clean up the DOM of the currently loaded page. The steps run in a fixed
    order: comments are stripped, stage 1 of preprocess_browser is called,
    then the script/keep_only/remove_after/remove_before/remove cleanups are
    applied and finally stage 2 of preprocess_browser is called.
    '''
    # Remove comments as otherwise we can end up with nested comments, which
    # cause problems later
    browser.page.mainFrame().evaluateJavaScript(remove_comments)
    preprocess_browser(browser, url, 1, recursion_level)

    if remove_javascript:
        for script in browser.css_select('script', all=True):
            script.removeFromDocument()

    if keep_only:
        apply_keep_only(browser, keep_only)

    # remove_after is applied first, then remove_before
    for selector, is_before in ((remove_after, False), (remove_before, True)):
        if selector:
            remove_beyond(browser, selector, before=is_before)

    if remove:
        apply_remove(browser, remove)

    preprocess_browser(browser, url, 2, recursion_level)
def fetch_page(
    url=None,
    load_complete=lambda browser, url, recursion_level: True,
    links=lambda browser, url, recursion_level: (),
    keep_only=(),
    remove_after=None,
    remove_before=None,
    remove=(),
    remove_javascript=True,
    delay=0,
    preprocess_browser=lambda browser, url, stage, recursion_level:None,
    postprocess_html=lambda root, url, recursion_level: root,
    resource_cache=None,
    output_dir=None,
    browser=None,
    recursion_level=0
    ):
    '''
    Download a page (and, recursively, the pages it links to) using a
    javascript capable browser, clean it up and save it to disk.

    :param url: The URL to load. If None, the page already loaded in browser is used.
    :param load_complete: Callback polled until it returns True, see the timeout below
    :param links: Callback returning the URLs of the linked pages to fetch
    :param keep_only: CSS selectors of the elements to keep
    :param remove_after: CSS selector, everything after the first match is removed
    :param remove_before: CSS selector, everything before the first match is removed
    :param remove: CSS selectors of the elements to remove
    :param remove_javascript: If True, <script> elements are removed
    :param delay: Seconds to sleep before loading each page
    :param preprocess_browser: Callback run before (stage 1) and after (stage 2) the DOM cleanup
    :param postprocess_html: Callback run on the parsed HTML, returning None aborts the page
    :param resource_cache: dict used to avoid saving identical resources twice.
                           A new dict is created if None.
    :param output_dir: Directory to save into, defaults to the current directory
    :param browser: The browser to use, a new one is created if None
    :param recursion_level: Zero for the top-level page, incremented for linked pages
    :return: Tuple of the paths of the saved HTML files, this page first
    :raises Timeout: If load_complete does not return True within ``browser.default_timeout`` seconds
    :raises AbortFetch: If postprocess_html aborts this page. Aborted linked pages are skipped.
    '''
    output_dir = output_dir or os.getcwdu()
    if resource_cache is None:
        # Create the cache per call, a mutable default argument would be
        # shared (and keep growing) across unrelated calls
        resource_cache = {}
    if browser is None:
        browser = jsbrowser()

    if delay:
        time.sleep(delay)

    # Load the DOM
    if url is not None:
        start_time = time.time()
        browser.start_load(url)
        while not load_complete(browser, url, recursion_level):
            browser.run_for_a_time(0.1)
            if time.time() - start_time > browser.default_timeout:
                raise Timeout('Timed out while waiting for %s to load' % url)

    # The links must be collected before the cleanup, as the cleanup may
    # remove the elements that contain them
    child_urls = links(browser, url, recursion_level)

    # Cleanup the DOM
    clean_dom(
        browser, url, recursion_level, preprocess_browser,
        remove_javascript, keep_only, remove_after, remove_before, remove)

    # Download resources
    download_resources(browser, resource_cache, output_dir)

    # Get HTML from the DOM
    pages = [save_html(browser, output_dir, postprocess_html, url, recursion_level)]

    # Fetch the linked pages
    for i, curl in enumerate(child_urls):
        odir = os.path.join(output_dir, 'link%d' % (i + 1))
        if not os.path.exists(odir):
            os.mkdir(odir)
        try:
            # remove_javascript is passed on as well, so that linked pages
            # are cleaned with the same settings as the top-level page
            pages.extend(fetch_page(
                curl, load_complete=load_complete, links=links, keep_only=keep_only,
                remove_after=remove_after, remove_before=remove_before, remove=remove,
                remove_javascript=remove_javascript,
                preprocess_browser=preprocess_browser, postprocess_html=postprocess_html,
                resource_cache=resource_cache, output_dir=odir, browser=browser, delay=delay,
                recursion_level=recursion_level+1))
        except AbortFetch:
            continue
    return tuple(pages)
if __name__ == '__main__':
browser = jsbrowser()
fetch_page('http://www.time.com/time/magazine/article/0,9171,2145057,00.html', browser=browser,
links=partial(links_from_selectors, ('.wp-paginate a.page[href]',), 1),
keep_only=('article.post',), remove=('.entry-sharing', '.entry-footer', '.wp-paginate', '.post-rail'))

View File

@ -7,26 +7,30 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, pprint, time
import os, pprint, time, uuid
from cookielib import Cookie
from threading import current_thread
from PyQt4.Qt import (QObject, QNetworkAccessManager, QNetworkDiskCache,
QNetworkProxy, QNetworkProxyFactory, QEventLoop, QUrl, pyqtSignal,
QDialog, QVBoxLayout, QSize, QNetworkCookieJar, Qt, pyqtSlot)
QDialog, QVBoxLayout, QSize, QNetworkCookieJar, Qt, pyqtSlot, QPixmap)
from PyQt4.QtWebKit import QWebPage, QWebSettings, QWebView, QWebElement
from calibre import USER_AGENT, prints, get_proxies, get_proxy_info
from calibre import USER_AGENT, prints, get_proxies, get_proxy_info, prepare_string_for_xml
from calibre.constants import ispy3, cache_dir
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.logging import ThreadSafeLog
from calibre.gui2 import must_use_qt
from calibre.web.jsbrowser.forms import FormsMixin
from calibre.web.jsbrowser.forms import FormsMixin, default_timeout
class Timeout(Exception): pass
class Timeout(Exception):
pass
class LoadError(Exception): pass
class LoadError(Exception):
pass
class WebPage(QWebPage): # {{{
class WebPage(QWebPage): # {{{
def __init__(self, log,
confirm_callback=None,
@ -48,6 +52,26 @@ class WebPage(QWebPage): # {{{
QWebSettings.enablePersistentStorage(os.path.join(cache_dir(),
'webkit-persistence'))
QWebSettings.setMaximumPagesInCache(0)
self.bridge_name = 'b' + uuid.uuid4().get_hex()
self.mainFrame().javaScriptWindowObjectCleared.connect(
self.add_window_objects)
self.dom_loaded = False
def add_window_objects(self):
self.dom_loaded = False
mf = self.mainFrame()
mf.addToJavaScriptWindowObject(self.bridge_name, self)
mf.evaluateJavaScript('document.addEventListener( "DOMContentLoaded", %s.content_loaded, false )' % self.bridge_name)
def load_url(self, url):
self.dom_loaded = False
url = QUrl(url)
self.mainFrame().load(url)
self.ready_state # Without this, DOMContentLoaded does not fire for file:// URLs
@pyqtSlot()
def content_loaded(self):
self.dom_loaded = True
def userAgentForUrl(self, url):
return self.user_agent
@ -96,9 +120,28 @@ class WebPage(QWebPage): # {{{
def ready_state(self):
return unicode(self.mainFrame().evaluateJavaScript('document.readyState').toString())
@pyqtSlot(QPixmap)
def transfer_image(self, img):
self.saved_img = img
def get_image(self, qwe_or_selector):
qwe = qwe_or_selector
if not isinstance(qwe, QWebElement):
qwe = self.mainFrame().findFirstElement(qwe)
if qwe.isNull():
raise ValueError('Failed to find element with selector: %r'
% qwe_or_selector)
self.saved_img = QPixmap()
qwe.evaluateJavaScript('%s.transfer_image(this)' % self.bridge_name)
try:
return self.saved_img
finally:
del self.saved_img
# }}}
class ProxyFactory(QNetworkProxyFactory): # {{{
class ProxyFactory(QNetworkProxyFactory): # {{{
def __init__(self, log):
QNetworkProxyFactory.__init__(self)
@ -107,9 +150,11 @@ class ProxyFactory(QNetworkProxyFactory): # {{{
for scheme, proxy_string in proxies.iteritems():
scheme = scheme.lower()
info = get_proxy_info(scheme, proxy_string)
if info is None: continue
if info is None:
continue
hn, port = info['hostname'], info['port']
if not hn or not port: continue
if not hn or not port:
continue
log.debug('JSBrowser using proxy:', pprint.pformat(info))
pt = {'socks5':QNetworkProxy.Socks5Proxy}.get(scheme,
QNetworkProxy.HttpProxy)
@ -128,21 +173,22 @@ class ProxyFactory(QNetworkProxyFactory): # {{{
return [self.proxies.get(scheme, self.default_proxy)]
# }}}
class NetworkAccessManager(QNetworkAccessManager): # {{{
class NetworkAccessManager(QNetworkAccessManager): # {{{
OPERATION_NAMES = { getattr(QNetworkAccessManager, '%sOperation'%x) :
OPERATION_NAMES = {getattr(QNetworkAccessManager, '%sOperation'%x) :
x.upper() for x in ('Head', 'Get', 'Put', 'Post', 'Delete',
'Custom')
}
report_reply_signal = pyqtSignal(object)
def __init__(self, log, use_disk_cache=True, parent=None):
def __init__(self, log, disk_cache_size=50, parent=None):
QNetworkAccessManager.__init__(self, parent)
self.reply_count = 0
self.log = log
if use_disk_cache:
if disk_cache_size > 0:
self.cache = QNetworkDiskCache(self)
self.cache.setCacheDirectory(os.path.join(cache_dir(), 'jsbrowser'))
self.cache.setCacheDirectory(PersistentTemporaryDirectory(prefix='disk_cache_'))
self.cache.setMaximumCacheSize(int(disk_cache_size * 1024 * 1024))
self.setCache(self.cache)
self.sslErrors.connect(self.on_ssl_errors)
self.pf = ProxyFactory(log)
@ -194,10 +240,11 @@ class NetworkAccessManager(QNetworkAccessManager): # {{{
def report_reply(self, reply):
reply_url = unicode(reply.url().toString())
self.reply_count += 1
err = reply.error()
if reply.error():
self.log.warn("Reply error: %s - %d (%s)" %
(reply_url, reply.error(), reply.errorString()))
if err:
l = self.log.debug if err == reply.OperationCanceledError else self.log.warn
l("Reply error: %s - %d (%s)" % (reply_url, err, unicode(reply.errorString())))
else:
debug = []
debug.append("Reply successful: %s" % reply_url)
@ -230,18 +277,18 @@ class NetworkAccessManager(QNetworkAccessManager): # {{{
c = Cookie(0, # version
name, value,
None, # port
False, # port specified
False, # port specified
domain, domain_specified, initial_dot, path,
path_specified,
secure, expires, is_session_cookie,
None, # Comment
None, # Comment URL
{} # rest
None, # Comment
None, # Comment URL
{} # rest
)
yield c
# }}}
class LoadWatcher(QObject): # {{{
class LoadWatcher(QObject): # {{{
def __init__(self, page, parent=None):
QObject.__init__(self, parent)
@ -257,7 +304,7 @@ class LoadWatcher(QObject): # {{{
self.page = None
# }}}
class BrowserView(QDialog): # {{{
class BrowserView(QDialog): # {{{
def __init__(self, page, parent=None):
QDialog.__init__(self, parent)
@ -283,7 +330,7 @@ class Browser(QObject, FormsMixin):
def __init__(self,
# Logging. If None, uses a default log, which does not output
# debugging info
log = None,
log=None,
# Receives a string and returns True/False. By default, returns
# True for all strings
confirm_callback=None,
@ -296,14 +343,20 @@ class Browser(QObject, FormsMixin):
# User agent to be used
user_agent=USER_AGENT,
# If True a disk cache is used
use_disk_cache=True,
# The size (in MB) of the on disk cache. Note that because the disk
# cache cannot be shared between different instances, we currently
# use a temporary dir for the cache, which is deleted on
# program exit. Set to zero to disable cache.
disk_cache_size=50,
# Enable Inspect element functionality
enable_developer_tools=False,
# Verbosity
verbosity = 0
verbosity=0,
# The default timeout (in seconds)
default_timeout=30
):
must_use_qt()
QObject.__init__(self)
@ -314,12 +367,13 @@ class Browser(QObject, FormsMixin):
if verbosity:
log.filter_level = log.DEBUG
self.log = log
self.default_timeout = default_timeout
self.page = WebPage(log, confirm_callback=confirm_callback,
prompt_callback=prompt_callback, user_agent=user_agent,
enable_developer_tools=enable_developer_tools,
parent=self)
self.nam = NetworkAccessManager(log, use_disk_cache=use_disk_cache, parent=self)
self.nam = NetworkAccessManager(log, disk_cache_size=disk_cache_size, parent=self)
self.page.setNetworkAccessManager(self.nam)
@property
@ -327,6 +381,7 @@ class Browser(QObject, FormsMixin):
return self.page.user_agent
def _wait_for_load(self, timeout, url=None):
timeout = self.default_timeout if timeout is default_timeout else timeout
loop = QEventLoop(self)
start_time = time.time()
end_time = start_time + timeout
@ -358,7 +413,16 @@ class Browser(QObject, FormsMixin):
if not loop.processEvents():
time.sleep(0.1)
def visit(self, url, timeout=30.0):
def wait_for_element(self, selector, timeout=default_timeout, url=None):
    '''
    Wait until an element matching the CSS selector is present in the page
    and return it (as returned by :meth:`css_select`).

    :param selector: The CSS selector to wait for
    :param timeout: Seconds to wait, defaults to ``self.default_timeout``
    :param url: Optional URL of the page being loaded, used only in the
                error message. Accepted so that callers such as
                :meth:`start_load`, which pass ``url``, do not fail.
    :raises Timeout: If no matching element appears within timeout seconds
    '''
    timeout = self.default_timeout if timeout is default_timeout else timeout
    start_time = time.time()
    while True:
        elem = self.css_select(selector)
        if elem is not None:
            return elem
        self.run_for_a_time(0.1)
        if time.time() - start_time > timeout:
            where = (' in %s' % url) if url else ''
            # %.1f rather than %.1g, which renders e.g. 30 as 3e+01
            raise Timeout('Failed to find element matching %r%s in %.1f seconds' % (
                selector, where, timeout))
def visit(self, url, timeout=default_timeout):
'''
Open the page specified in URL and wait for it to complete loading.
Note that when this method returns, there may still be javascript
@ -369,14 +433,38 @@ class Browser(QObject, FormsMixin):
Returns True if loading was successful, False otherwise.
'''
self.current_form = None
self.page.mainFrame().load(QUrl(url))
self.page.load_url(url)
return self._wait_for_load(timeout, url)
def back(self, wait_for_load=True, timeout=default_timeout):
'''
Like clicking the back button in the browser. Waits for loading to complete.
This method will raise a Timeout exception if loading takes more than timeout seconds.
Returns True if loading was successful, False otherwise.
'''
self.page.triggerAction(self.page.Back)
if wait_for_load:
return self._wait_for_load(timeout)
def stop(self):
    '''Abort loading of the current page by triggering the page's Stop action.'''
    page = self.page
    page.triggerAction(page.Stop)
def stop_scheduled_refresh(self):
    '''Cancel any page refresh/reload that the current page has scheduled.'''
    page = self.page
    page.triggerAction(page.StopScheduledPageRefresh)
def reload(self, bypass_cache=False):
    '''
    Reload the current page.

    :param bypass_cache: If True, trigger the page's ReloadAndBypassCache
        action instead of the plain Reload action.
    '''
    page = self.page
    if bypass_cache:
        action = page.ReloadAndBypassCache
    else:
        action = page.Reload
    page.triggerAction(action)
@property
def dom_ready(self):
return self.page.ready_state in {'complete', 'interactive'}
return self.page.dom_loaded
def wait_till_dom_ready(self, timeout=30.0, url=None):
def wait_till_dom_ready(self, timeout=default_timeout, url=None):
timeout = self.default_timeout if timeout is default_timeout else timeout
start_time = time.time()
while not self.dom_ready:
if time.time() - start_time > timeout:
@ -384,18 +472,20 @@ class Browser(QObject, FormsMixin):
url, timeout))
self.run_for_a_time(0.1)
def start_load(self, url, timeout=30.0):
def start_load(self, url, timeout=default_timeout, selector=None):
'''
Start the loading of the page at url and return once the DOM is ready,
sub-resources such as scripts/stylesheets/images/etc. may not have all
loaded.
'''
self.current_form = None
self.page.mainFrame().load(QUrl(url))
self.run_for_a_time(0.01)
self.wait_till_dom_ready(timeout=timeout, url=url)
self.page.load_url(url)
if selector is not None:
self.wait_for_element(selector, timeout=timeout, url=url)
else:
self.wait_till_dom_ready(timeout=timeout, url=url)
def click(self, qwe_or_selector, wait_for_load=True, ajax_replies=0, timeout=30.0):
def click(self, qwe_or_selector, wait_for_load=True, ajax_replies=0, timeout=default_timeout):
'''
Click the :class:`QWebElement` pointed to by qwe_or_selector.
@ -408,8 +498,8 @@ class Browser(QObject, FormsMixin):
initial_count = self.nam.reply_count
qwe = qwe_or_selector
if not isinstance(qwe, QWebElement):
qwe = self.page.mainFrame().findFirstElement(qwe)
if qwe.isNull():
qwe = self.css_select(qwe)
if qwe is None:
raise ValueError('Failed to find element with selector: %r'
% qwe_or_selector)
js = '''
@ -425,7 +515,7 @@ class Browser(QObject, FormsMixin):
raise LoadError('Clicking resulted in a failed load')
def click_text_link(self, text_or_regex, selector='a[href]',
wait_for_load=True, ajax_replies=0, timeout=30.0):
wait_for_load=True, ajax_replies=0, timeout=default_timeout):
target = None
for qwe in self.page.mainFrame().findAllElements(selector):
src = unicode(qwe.toPlainText())
@ -441,6 +531,88 @@ class Browser(QObject, FormsMixin):
return self.click(target, wait_for_load=wait_for_load,
ajax_replies=ajax_replies, timeout=timeout)
def css_select(self, selector, all=False):
    '''
    Query the main frame with a CSS selector.

    :param selector: The CSS selector to evaluate.
    :param all: If True, return a tuple of every matching element
        (possibly empty). Otherwise return only the first match, or None
        when nothing matches.
    '''
    frame = self.page.mainFrame()
    if not all:
        first = frame.findFirstElement(selector)
        # A null element is how the frame reports "no match"
        return None if first.isNull() else first
    return tuple(frame.findAllElements(selector).toList())
def get_image(self, qwe_or_selector):
    '''
    Fetch the image designated by qwe_or_selector as a QPixmap by
    delegating to the page. The pixmap is null when there is no such
    image.
    '''
    pixmap = self.page.get_image(qwe_or_selector)
    return pixmap
def get_cached(self, url):
    '''
    Return the cached data for url as a bytestring, or None if the
    network cache has no entry for it.
    '''
    device = self.nam.cache.data(QUrl(url))
    if device is None:
        return None
    try:
        raw = device.readAll()
        return bytes(bytearray(raw))
    finally:
        # Close the IODevice immediately so the file backing it can be
        # removed when the cache needs the space; otherwise the file
        # remains locked on windows
        device.close()
        del device
def wait_for_resources(self, urls, timeout=default_timeout):
    '''
    Wait for the resources at the specified URLs to appear in the network
    cache and return them.

    Polling continues until every URL has been found, timeout seconds
    have elapsed, or the page reports that it has finished loading,
    whichever happens first. One final cache lookup is made after the
    loop for any URLs still missing.

    :param urls: Iterable of URLs to look for.
    :param timeout: Maximum number of seconds to keep polling. If left at
        the ``default_timeout`` sentinel, ``self.default_timeout`` is used.

    Returns a dict mapping each URL that was found to its cached contents
    (as returned by :meth:`get_cached`). URLs that never showed up are
    simply absent from the dict; no exception is raised on timeout.
    '''
    timeout = self.default_timeout if timeout is default_timeout else timeout
    start_time = time.time()
    ans = {}
    urls = set(urls)

    def get_resources():
        # Iterate over a snapshot, since found URLs are removed from the
        # pending set while looping
        for url in tuple(urls):
            raw = self.get_cached(url)
            if raw is not None:
                ans[url] = raw
                urls.discard(url)

    # Keep polling only while we are still *within* the timeout. The
    # comparison must be "<": with ">" the loop body could never run
    # before the timeout expired, so the method would not wait at all.
    while (urls and time.time() - start_time < timeout and
            self.page.ready_state not in {'complete', 'completed'}):
        get_resources()
        if urls:
            self.run_for_a_time(0.1)

    if urls:
        # Last chance: pick up anything cached since the final poll
        get_resources()
    return ans
def get_resource(self, url, rtype='img', use_cache=True, timeout=default_timeout):
    '''
    Fetch a single resource (image/stylesheet/script) and return it as a
    bytestring, or None if it could not be loaded.

    The resource is fetched by loading a minimal HTML page whose only
    content is a tag referring to url; the data is then read back out of
    the network cache. Consequently this method cannot be used with the
    cache disabled: a RuntimeError is raised if the network access manager
    has no ``cache`` attribute.

    :param rtype: One of 'img', 'link' or 'script'. Selects the tag used
        to reference the resource. Any other value raises ValueError.
    :param use_cache: If True, the cache is consulted first and a cached
        copy is returned without loading anything. This avoids a roundtrip
        to the server, which is a big performance boost in the common
        case, but can yield a stale object if the resource has changed on
        the server.
    :param timeout: Passed on to the wait for the helper page to load.
    '''
    if not hasattr(self.nam, 'cache'):
        raise RuntimeError('Cannot get resources when the cache is disabled')

    if use_cache:
        cached = self.get_cached(url)
        if cached is not None:
            return cached

    templates = {
        'img': '<img src="%s">',
        'link': '<link href="%s"></link>',
        'script': '<script src="%s"></script>',
    }
    try:
        tag = templates[rtype] % prepare_string_for_xml(url, attribute=True)
    except KeyError:
        raise ValueError('Unknown resource type: %s' % rtype)

    frame = self.page.mainFrame()
    frame.setHtml(
        '''<!DOCTYPE html><html><body><div>{0}</div></body></html>'''.format(tag))
    self._wait_for_load(timeout)
    # Whatever the load produced is now in the cache (or not, giving None)
    return self.get_cached(url)
def show_browser(self):
'''
@ -461,11 +633,18 @@ class Browser(QObject, FormsMixin):
def html(self):
    # Serialize the main frame's current content to a unicode string
    frame = self.page.mainFrame()
    return unicode(frame.toHtml())
def close(self):
def blank(self):
    '''
    Navigate to about:blank, waiting at most 0.01 seconds for the load.
    A Timeout from that short wait is deliberately ignored.
    '''
    try:
        self.visit('about:blank', timeout=0.01)
    except Timeout:
        # Best effort only, not finishing within the tiny timeout is fine
        return
def close(self):
    '''
    Shut the browser down: stop loading, switch to a blank page, stop
    again, swap a fresh QNetworkDiskCache into the network access manager,
    and finally drop the references to the manager and the page. The
    browser cannot be used after this call.
    '''
    # Stop both before and after blanking the page
    self.stop()
    self.blank()
    self.stop()

    nam = self.nam
    # NOTE(review): presumably replacing the cache lets the old cache object
    # be released -- confirm against NetworkAccessManager
    nam.setCache(QNetworkDiskCache())
    nam.cache = None

    self.nam = None
    self.page = None
def __enter__(self):
@ -474,3 +653,5 @@ class Browser(QObject, FormsMixin):
def __exit__(self, *args):
    # Leaving the ``with`` block closes the browser. The exception details
    # in args are ignored and None is returned, so exceptions propagate.
    self.close()
    return None

View File

@ -10,6 +10,8 @@ __docformat__ = 'restructuredtext en'
from calibre import as_unicode
# Sentinel used as the default value of ``timeout`` parameters. It is compared
# by identity (``timeout is default_timeout``) so that "no timeout given" can
# be told apart from any real value and replaced by the caller's own default.
default_timeout = object()
# Forms {{{
class Control(object):
@ -43,7 +45,7 @@ class Control(object):
self.qwe.setAttribute('value', as_unicode(val))
elif self.type in ('number', 'range'):
self.qwe.setAttribute('value', '%d'%int(val))
else: # Unknown type treat as text
else: # Unknown type treat as text
self.qwe.setAttribute('value', as_unicode(val))
return property(fget=fget, fset=fset)
@ -221,7 +223,7 @@ class FormsMixin(object):
return self.current_form
def submit(self, submit_control_selector=None, wait_for_load=True,
ajax_replies=0, timeout=30.0):
ajax_replies=0, timeout=default_timeout):
'''
Submit the currently selected form. Tries to autodetect the submit
control. You can override auto-detection by specifying a CSS2 selector
@ -238,7 +240,7 @@ class FormsMixin(object):
ajax_replies=ajax_replies, timeout=timeout)
def ajax_submit(self, submit_control_selector=None,
num_of_replies=1, timeout=30.0):
num_of_replies=1, timeout=default_timeout):
'''
Submit the current form. This method is meant for those forms that
use AJAX rather than a plain submit. It will block until the specified
@ -249,3 +251,4 @@ class FormsMixin(object):
wait_for_load=False, ajax_replies=num_of_replies,
timeout=timeout)