mirror of https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00

[Sync] Sync with trunk. Revision 7760

This commit is contained in commit 5c52c0f6bb

324  Changelog.yaml
@@ -4,6 +4,330 @@
 # for important features/bug fixes.
 # Also, each release can have new and improved recipes.
+
+- version: 0.7.42
+  date: 2011-01-21
+
+  new features:
+    - title: "0.7.42 is a re-release of 0.7.41, because conversion to MOBI was broken in 0.7.41"
+
+    - title: "Conversion: Replace the remove header/footer options with a more generic search and replace option that allows you to not only remove but also replace text"
+
+    - title: "Conversion: The preprocess HTML option has now become a new 'Heuristic Processing' option which allows you to control exactly which heuristics are used"
+
+    - title: "Conversion: Various improvements to Heuristic Processing (used to be preprocess HTML)"
+
+    - title: "When adding empty books to calibre, optionally set the author to the author of the currently selected book"
+      tickets: [7702]
+
+    - title: "Device drivers for the Archos 101, SmartQ T7 and Acer Lumiread"
+
+    - title: "Catalog generation: Make the By Authors section optional"
+
+    - title: "Allow bulk editing of the Date and Published columns."
+
+    - title: "Add a little button to clear the date and published values to the edit metadata dialogs"
+
+    - title: "When adding books by ISBN, allow the specification of special tags that will be added to the new book entries"
+      tickets: [8436]
+
+    - title: "Completion on multiple authors"
+      tickets: [8405]
+
+    - title: "Add AZW to the default list of internally viewed formats, as I am tired of getting tickets about it"
+
+    - title: "Nicer error message when catalog generation fails"
+
+    - title: "Add capitalize option to context menus in the edit metadata dialog"
+
+  bug fixes:
+    - title: "RTF Input: Fix regression in 0.7.40 that broke conversion of some old style RTF files"
+
+    - title: "Fix Tag editor forgets position"
+      tickets: [8271]
+
+    - title: "When converting books in the calibre GUI, override metadata from the input document, even when empty."
+      description: >
+        "So if you have removed all the tags and comments for the book in the calibre GUI, but the actual
+        file that is being converted still has tags and comments, they are ignored. This affects only
+        conversions in the calibre GUI, not from the command line via ebook-convert."
+      tickets: [8390]
+
+    - title: "Fix memory leak when switching libraries"
+
+    - title: "RTF Output: Fix incorrect spacing between letters."
+      tickets: [8422]
+
+    - title: "Catalog generation: Add composite columns to the types eligible for Merge Comments"
+
+    - title: "Add a confirmation when closing the add a custom news source dialog."
+      tickets: [8460]
+
+    - title: "Another workaround for LibraryThing UA sniffing that was preventing series metadata download, sigh."
+      tickets: [8477]
+
+    - title: "PD Novel driver: Put books on the SD card into the eBooks folder"
+
+    - title: "When shortening filepaths to conform to Windows path length limitations, remove text from the middle of each component instead of the ends."
+      tickets: [8451]
+
+    - title: "Make completion in most places case insensitive"
+      tickets: [8441]
+
+    - title: "Fix regression that caused the N key to stop working when editing a Yes/No column"
+      tickets: [8417]
+
+    - title: "Email: Fix bug when connecting to SMTP relays that use MD5 auth"
+
+    - title: "MOBI Output: Fix bug that could cause a link pointing to the start of a section to go to a point later in the section if the section contained an empty id attribute"
+
+    - title: "When auto converting books and the device is unplugged, do not raise an error."
+      tickets: [8426]
+
+    - title: "Ebook-viewer: Display cover when viewing FB2 files"
+
+    - title: "MOBI Input: Special case handling of empty div tags with a defined height used as paragraph separators."
+      tickets: [8391]
+
+    - title: "Fix sorting of author names into sub categories by first letter in the Tag Browser when the first letter has diacritics"
+      tickets: [8378]
+
+    - title: "Fix regression in 0.7.40 that caused commas in author names to become | when converting/saving to disk"
+
+    - title: "Fix error when viewing a specific format on a book with no formats"
+      tickets: [8352]
+
+  improved recipes:
+    - Blic
+    - Las Vegas Review Journal
+    - La Vanguardia
+    - New York Times
+    - El Pais
+    - Seattle Times
+    - Ars Technica
+    - Dilbert
+    - Nature News
+
+  new recipes:
+    - title: "kath.net"
+      author: "Bobus"
+
+    - title: "iHNed"
+      author: "Karel Bilek"
+
+    - title: "Gulf News"
+      author: "Darko Miletic"
+
+    - title: "South Africa Mail and Guardian"
+      author: "77ja65"
+
+
+- version: 0.7.40
+  date: 2011-01-14
+
+  new features:
+    - title: "A new 'highlight matches' search mode"
+      description: >
+        "There is now a checkbox next to the search bar named 'Highlight'. If you check it, searching will highlight
+        all matched books instead of filtering the book list to all matched books."
+
+    - title: "RTF Input: Improved support for conversion of images. The bug where some images were shrunk should no longer happen"
+
+    - title: "Template language: Allow you to create your own formatting functions. Accessible via Preferences->Advanced->Template functions"
+
+    - title: "News download: Convert various HTML 5 tags into <div> to support readers that cannot handle HTML 5 tags"
+
+    - title: "RTF metadata: Add support for publisher and tags."
+      tickets: [6657]
+
+    - title: "BibTeX catalog: Add support for custom columns"
+
+    - title: "TXT Input: Support for textile markup"
+
+    - title: "Various minor tweaks to improve usability of Preferences->Plugins"
+
+    - title: "TXT Output: Convert <hr> to scene break marker."
+
+    - title: "Support for the Archos 70"
+
+    - title: "SONY driver: Add an option to automatically refresh the covers on every connect. Accessible via: Preferences->Plugins->Device interface plugins"
+
+    - title: "Add access to the larger template editor from plugboards via context menu."
+
+    - title: "Speed improvement when connecting a large library to a device"
+
+    - title: "Speedup when searching on multiple words in a large library"
+
+    - title: "TXT Input: Add a heuristic formatting processor"
+
+  bug fixes:
+    - title: "Fix bug that caused automatic news removal to remove any book that has a tag that contains the word 'news' instead of only books that have the tag News"
+
+    - title: "Refactor the downloading social metadata message box to allow canceling."
+      tickets: [8234]
+
+    - title: "Kobo driver does not deal with null values in the DateCreated column"
+      tickets: [8308]
+
+    - title: "MOBI Input: Fix regression that caused images placed inside svg tags to be discarded"
+
+    - title: "Fix selecting the Tablet output profile actually selecting the Samsung Galaxy S profile"
+
+    - title: "Catalog generation: Fix a condition that could cause TOCs to not be properly generated in MOBI format catalogs"
+      tickets: [8295]
+
+    - title: "Zip file reading: Be more tolerant when a zip file has a damaged file directory"
+
+    - title: "RTF Input: Various code cleanups. Go back to trying to handle unicode mappings without pre-processing. This will mean that some RTF files that used to convert won't anymore. Please open tickets and attach them."
+      tickets: [8171]
+
+    - title: "ImageMagick: When identifying an image, don't read the entire image"
+
+    - title: "FB2 Output: Add cover to FB2 metadata."
+
+    - title: "Fix inability to customize a built-in recipe when more than one recipe has the same name"
+      tickets: [8281]
+
+    - title: "RTF Input: Fix regression that broke the Preprocess HTML option"
+
+    - title: "Fix XSS vulnerability in content server."
+      tickets: [7980]
+
+    - title: "TXT Output: Clean up and produce consistent output. Spacing around headings. Headings are not indented when using the remove paragraph spacing option."
+
+    - title: "Catalog generation: Handle invalid covers gracefully"
+
+    - title: "Email settings: Before displaying the email test dialog, warn the user that it will expose their email password"
+
+    - title: "PDB Output: Fix regression that caused some PDB files to not work with other software"
+      tickets: [8231]
+
+  improved recipes:
+    - Financial Times UK
+    - Globe and Mail
+    - Wired Daily
+    - MIT Technology Review
+    - MSNBC
+    - expansion.com
+    - New York Times
+    - Heraldo de Aragon
+    - Exiled online
+
+  new recipes:
+    - title: "Yakima Herald and Tri-City Herald"
+      author: "Laura Gjovaag"
+
+    - title: "Wichita Eagle"
+      author: "Jason Cameron"
+
+    - title: "Pressthink and Zero Hedge"
+      author: "Darko Miletic"
+
+    - title: "tyzden"
+      author: "zemiak"
+
+    - title: "El Correo"
+      author: "desUBIKado"
+
+    - title: "Cicero"
+      author: "mad"
+
+    - title: "El Publico"
+      author: "Gerardo Diez"
+
+
+- version: 0.7.38
+  date: 2011-01-07
+
+  new features:
+    - title: "Reduce startup time when using a composite custom column"
+
+    - title: "Template language: Add a list_item function for use with tags like columns. See User Manual for details"
+
+    - title: "TXT Input: Attempt to detect the input encoding when not specified. Auto detect paragraph structure and formatting markup."
+
+    - title: "Search & replace: Add ability to manipulate number and boolean columns."
+
+    - title: "Add type ahead completion to the advanced search dialog."
+      tickets: [8035]
+
+    - title: "Double click on a plugin in the Preferences dialog to customize it"
+      tickets: [8175]
+
+    - title: "Allow customization of the SONY driver to send thumbnails to the device. Useful with newer SONY readers"
+      tickets: [8161]
+
+    - title: "Smarten punctuation: Convert double dashes to em dashes. Preprocessing: Various tweaks"
+
+  bug fixes:
+    - title: "Fix regression causing the template formatter to interpret a missing format letter as ERROR instead of 's'."
+
+    - title: "Fix regression that broke conversion of PNG images in PDF files on OS X."
+      tickets: [8215]
+
+    - title: "Content server: Fix improper XML escaping of category titles in the OPDS feeds"
+      tickets: [8225]
+
+    - title: "When decoding XML, if the XML starts with a UTF-8 BOM decode as UTF-8. Fixes parsing of FB2 files with UTF-8 BOMs"
+
+    - title: "E-book viewer: When scrolling to a bookmark and the content is wider than the window, do not scroll in the horizontal direction"
+
+    - title: "E-book viewer: Fix next page skipping the bottom of chapters when the content is wider than the window."
+      tickets: [8153]
+
+    - title: "FB2 Output: Insert covers."
+      tickets: [8172]
+
+    - title: "Content server: When serving OPDS feeds, handle html descriptions that have namespaced attributes."
+      tickets: [7938]
+
+    - title: "When downloading metadata from isbndb.com, download a maximum of 30 results rather than 1000"
+
+    - title: "Fix sorting of tags column"
+
+    - title: "Change search/replace to show commas instead of vertical bars as the separator for multiple authors"
+
+    - title: "Template language: Make all column names case insensitive"
+
+    - title: "Fix bug that prevented the Disabled option for Tag Browser partitioning from working in the Preferences dialog"
+
+    - title: "Fix bug when using tags like custom columns in the template language"
+
+    - title: "Fix bug where composite custom columns using general_program_mode fields are not evaluated correctly when used in a template."
+
+    - title: "ImageMagick interface: Don't crash when asked to open empty image files"
+
+    - title: "Kobo driver: Add TXT, CBZ and CBR to the supported formats list"
+      tickets: [8124]
+
+    - title: "Don't unnecessarily scroll the book list horizontally when re-selecting previously selected rows."
+
+  new recipes:
+    - title: "New London Day"
+      author: "Being"
+
+    - title: "Walla"
+      author: "marbs"
+
+    - title: "New Journal of Physics"
+      author: "Chema Cortes"
+
+    - title: "The Baltimore Sun"
+      author: "Josh Hall"
+
+    - title: "Arabian Business and Sunday Times (UK)"
+      author: "Darko Miletic"
+
+    - title: "Deia"
+      author: "Gerardo Diez"
+
+    - title: "Smarter Planet"
+      author: "Jack Mason"
+
+  improved recipes:
+    - The Atlantic
+    - Danas
+    - Ledevoir
+
 - version: 0.7.37
   date: 2011-01-02
CalibreRun.bat
@@ -1,6 +1,4 @@
 @echo OFF
-REM CalibreRun.bat
-REM ~~~~~~~~~~~~~~
 REM Batch File to start a Calibre configuration on Windows
 REM giving explicit control of the location of:
 REM - Calibe Program Files
@@ -24,7 +22,10 @@ REM -------------------------------------
 REM Set up Calibre Config folder
 REM -------------------------------------

-If EXIST CalibreConfig SET CALIBRE_CONFIG_DIRECTORY=%cd%\CalibreConfig
+IF EXIST CalibreConfig (
+    SET CALIBRE_CONFIG_DIRECTORY=%cd%\CalibreConfig
+    ECHO CONFIG=%cd%\CalibreConfig
+)

 REM --------------------------------------------------------------
@@ -38,9 +39,18 @@ REM drive letter of the USB stick.
 REM Comment out any of the following that are not to be used
 REM --------------------------------------------------------------

-SET CALIBRE_LIBRARY_DIRECTORY=U:\eBOOKS\CalibreLibrary
-IF EXIST CalibreLibrary SET CALIBRE_LIBRARY_DIRECTORY=%cd%\CalibreLibrary
-IF EXIST CalibreBooks SET CALIBRE_LIBRARY_DIRECTORY=%cd%\CalibreBooks
+IF EXIST U:\eBooks\CalibreLibrary (
+    SET CALIBRE_LIBRARY_DIRECTORY=U:\eBOOKS\CalibreLibrary
+    ECHO LIBRARY=U:\eBOOKS\CalibreLibrary
+)
+IF EXIST CalibreLibrary (
+    SET CALIBRE_LIBRARY_DIRECTORY=%cd%\CalibreLibrary
+    ECHO LIBRARY=%cd%\CalibreLibrary
+)
+IF EXIST CalibreBooks (
+    SET CALIBRE_LIBRARY_DIRECTORY=%cd%\CalibreBooks
+    ECHO LIBRARY=%cd%\CalibreBooks
+)

 REM --------------------------------------------------------------
@@ -50,12 +60,32 @@ REM Location where the metadata.db file is located. If not set
 REM the same location as Books files will be assumed. This.
 REM options is used to get better performance when the Library is
 REM on a (slow) network drive. Putting the metadata.db file
-REM locally gives a big performance improvement.
+REM locally makes gives a big performance improvement.
+REM
+REM NOTE. If you use this option, then the ability to switch
+REM libraries within Calibre will be disabled. Therefore
+REM you do not want to set it if the metadata.db file
+REM is at the same location as the book files.
 REM --------------------------------------------------------------

-IF EXIST CalibreBooks SET SET CALIBRE_OVERRIDE_DATABASE_PATH=%cd%\CalibreBooks\metadata.db
-IF EXIST CalibreMetadata SET CALIBRE_OVERRIDE_DATABASE_PATH=%cd%\CalibreMetadata\metadata.db
+IF EXIST CalibreBooks (
+    IF NOT "%CALIBRE_LIBRARY_DIRECTORY%" == "%cd%\CalibreBooks" (
+        SET SET CALIBRE_OVERRIDE_DATABASE_PATH=%cd%\CalibreBooks\metadata.db
+        ECHO DATABASE=%cd%\CalibreBooks\metadata.db
+        ECHO '
+        ECHO ***CAUTION*** Library Switching will be disabled
+        ECHO '
+    )
+)
+IF EXIST CalibreMetadata (
+    IF NOT "%CALIBRE_LIBRARY_DIRECTORY%" == "%cd%\CalibreMetadata" (
+        SET CALIBRE_OVERRIDE_DATABASE_PATH=%cd%\CalibreMetadata\metadata.db
+        ECHO DATABASE=%cd%\CalibreMetadata\metadata.db
+        ECHO '
+        ECHO ***CAUTION*** Library Switching will be disabled
+        ECHO '
+    )
+)

 REM --------------------------------------------------------------
 REM Specify Location of source (optional)
@@ -63,13 +93,20 @@ REM
 REM It is easy to run Calibre from source
 REM Just set the environment variable to where the source is located
 REM When running from source the GUI will have a '*' after the version.
+REM number that is displayed at the bottom of the Calibre main screen.
 REM --------------------------------------------------------------

-IF EXIST Calibre\src SET CALIBRE_DEVELOP_FROM=%cd%\Calibre\src
+IF EXIST Calibre\src (
+    SET CALIBRE_DEVELOP_FROM=%cd%\Calibre\src
+    ECHO SOURCE=%cd%\Calibre\src
+)
+IF EXIST D:\Calibre\Calibre\src (
+    SET CALIBRE_DEVELOP_FROM=D:\Calibre\Calibre\src
+    ECHO SOURCE=D:\Calibre\Calibre\src
+)

 REM --------------------------------------------------------------
-REM Specify Location of calibre binaries (optinal)
+REM Specify Location of calibre binaries (optional)
 REM
 REM To avoid needing Calibre to be set in the search path, ensure
 REM that Calibre Program Files is current directory when starting.
@@ -78,21 +115,15 @@ REM This folder can be populated by cpying the Calibre2 folder from
 REM an existing isntallation or by isntalling direct to here.
 REM --------------------------------------------------------------

-IF EXIST Calibre2 CD Calibre2
+IF EXIST Calibre2 (
+    CD Calibre2
+    ECHO PROGRAMS=%cd%
+)

-REM --------------------------------------------
-REM Display settings that will be used
-REM --------------------------------------------
-
-echo PROGRAMS=%cd%
-echo SOURCE=%CALIBRE_DEVELOP_FROM%
-echo CONFIG=%CALIBRE_CONFIG_DIRECTORY%
-echo LIBRARY=%CALIBRE_LIBRARY_DIRECTORY%
-echo DATABASE=%CALIBRE_OVERRIDE_DATABASE_PATH%
-
+REM ----------------------------------------------------------
 REM The following gives a chance to check the settings before
 REM starting Calibre. It can be commented out if not wanted.
+REM ----------------------------------------------------------

 echo "Press CTRL-C if you do not want to continue"
 pause
@@ -111,4 +142,4 @@ REM Use with /WAIT to wait until Calibre completes to run a task on exit
 REM --------------------------------------------------------

 echo "Starting up Calibre"
-START /belownormal Calibre --with-library %CALIBRE_LIBRARY_DIRECTORY%
+START /belownormal Calibre --with-library "%CALIBRE_LIBRARY_DIRECTORY%"
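For orientation, every folder the script probes for is expected alongside it at the root of the portable drive. An illustrative layout, assembled from the IF EXIST checks above (the drive letter is arbitrary; nothing here is prescribed by calibre itself):

    U:\
        CalibreRun.bat
        Calibre2\          calibre program files (CD'd into if present)
        CalibreConfig\     configuration files
        CalibreLibrary\    books plus metadata.db
        CalibreMetadata\   optional local metadata.db override
        Calibre\src\       optional source checkout for CALIBRE_DEVELOP_FROM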
42  resources/catalog/section_list_templates.py  Normal file
@@ -0,0 +1,42 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

'''
These templates control the content of titles displayed in the various sections

Available fields:
    {title}           Title of the book
    {series}          Series name
    {series_index}    Number of the book in the series
    {rating}          Rating
    {rating_parens}   Rating, in parentheses
    {pubyear}         Year the book was published
    {pubyear_parens}  Year the book was published, in parentheses
'''

# Books by Author
by_authors_normal_title_template = '{title} {pubyear_parens}'
by_authors_series_title_template = '[{series_index}] {title} {pubyear_parens}'

# Books by Title
by_titles_normal_title_template = '{title}'
by_titles_series_title_template = '{title} ({series} [{series_index}])'

# Books by Series
by_series_title_template = '[{series_index}] {title} {pubyear_parens}'

# Books by Genre
by_genres_normal_title_template = '{title} {pubyear_parens}'
by_genres_series_title_template = '{series_index}. {title} {pubyear_parens}'

# Recently Added
by_recently_added_normal_title_template = '{title}'
by_recently_added_series_title_template = '{title} ({series} [{series_index}])'

# By Month added
by_month_added_normal_title_template = '{title} {pubyear_parens}'
by_month_added_series_title_template = '[{series_index}] {title} {pubyear_parens}'
BIN  resources/images/document-encrypt.png  Normal file  (binary file not shown; size 8.8 KiB)
BIN  resources/images/heuristics.png        Normal file  (binary file not shown; size 9.3 KiB)
BIN  resources/images/news/exiled.png       Normal file  (binary file not shown; size 1.3 KiB)
BIN  resources/images/news/pressthink.png   Normal file  (binary file not shown; size 533 B)
BIN  resources/images/news/zerohedge.png    Normal file  (binary file not shown; size 3.0 KiB)
BIN  resources/images/template_funcs.png    Normal file  (binary file not shown; size 16 KiB)
@@ -1,6 +1,5 @@
-

 __license__ = 'GPL v3'
-__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 arstechnica.com
 '''
@@ -9,19 +8,26 @@ import re
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

-class ArsTechnica2(BasicNewsRecipe):
+class ArsTechnica(BasicNewsRecipe):
     title = u'Ars Technica'
     language = 'en'
-    __author__ = 'Darko Miletic and Sujata Raman'
+    __author__ = 'Darko Miletic, Sujata Raman, Alexis Rohou'
     description = 'The art of technology'
     publisher = 'Ars Technica'
     category = 'news, IT, technology'
-    oldest_article = 2
+    oldest_article = 5
     max_articles_per_feed = 100
     no_stylesheets = True
     encoding = 'utf-8'
     use_embedded_content = False
-    extra_css = ' body {font-family: Arial,Helvetica,sans-serif} .title{text-align: left} .byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none} '
+    extra_css = '''
+        body {font-family: Arial,Helvetica,sans-serif}
+        .title{text-align: left}
+        .byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none}
+        .news-item-figure-caption-text{font-size:small; font-style:italic}
+        .news-item-figure-caption-byline{font-size:small; font-style:italic; font-weight:bold}
+        '''
+    ignoreEtcArticles = True  # Etc feed items can be ignored, as they're not real stories

     conversion_options = {
         'comments' : description
@@ -31,10 +37,10 @@ class ArsTechnica2(BasicNewsRecipe):
     }


-    preprocess_regexps = [
-        (re.compile(r'<div class="news-item-figure', re.DOTALL|re.IGNORECASE),lambda match: '<div class="news-item-figure"')
-        ,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
-    ]
+    #preprocess_regexps = [
+    #    (re.compile(r'<div class="news-item-figure', re.DOTALL|re.IGNORECASE),lambda match: '<div class="news-item-figure"')
+    #    ,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
+    #]

     keep_only_tags = [dict(name='div', attrs={'id':['story','etc-story']})]

@@ -42,7 +48,7 @@ class ArsTechnica2(BasicNewsRecipe):
         dict(name=['object','link','embed'])
         ,dict(name='div', attrs={'class':'read-more-link'})
     ]
-    remove_attributes=['width','height']
+    #remove_attributes=['width','height']

     feeds = [
         (u'Infinite Loop (Apple content)' , u'http://feeds.arstechnica.com/arstechnica/apple/' )
@@ -56,6 +62,7 @@ class ArsTechnica2(BasicNewsRecipe):
         ,(u'Law & Disorder (Tech policy content)' , u'http://feeds.arstechnica.com/arstechnica/tech-policy/')
     ]

+    # This deals with multi-page stories
     def append_page(self, soup, appendtag, position):
         pager = soup.find('div',attrs={'class':'pager'})
         if pager:
@@ -81,6 +88,7 @@ class ArsTechnica2(BasicNewsRecipe):


     def preprocess_html(self, soup):
+        # Adds line breaks near the byline (not sure why this is needed)
         ftag = soup.find('div', attrs={'class':'byline'})
         if ftag:
             brtag = Tag(soup,'br')
@@ -88,12 +96,33 @@ class ArsTechnica2(BasicNewsRecipe):
             ftag.insert(4,brtag)
             ftag.insert(5,brtag2)

+        # Remove style items
         for item in soup.findAll(style=True):
             del item['style']

+        # Remove id
+        for item in soup.findAll(id=True):
+            del item['id']
+
+        # For some reason, links to authors don't have the domainname
+        a_author = soup.find('a',{'href':re.compile("^/author")})
+        if a_author:
+            a_author['href'] = 'http://arstechnica.com'+a_author['href']
+
+        # within div class news-item-figure, we need to grab images
+
+        # Deal with multi-page stories
         self.append_page(soup, soup.body, 3)

         return soup

     def get_article_url(self, article):
+        # If the article title starts with Etc:, don't return it
+        if self.ignoreEtcArticles:
+            article_title = article.get('title',None)
+            if re.match('Etc: ',article_title) is not None:
+                return None
+
+        # The actual article is in a guid tag
         return article.get('guid', None).rpartition('?')[0]
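The append_page/pager logic above is a common calibre recipe pattern for stitching multi-page stories into a single document. A minimal standalone sketch of the same idea, using the bs4 package in place of calibre's bundled BeautifulSoup; the 'Next' link text and div ids are assumptions for illustration, not Ars Technica's actual markup:

    import urllib.request
    from bs4 import BeautifulSoup

    def fetch(url):
        # Download and parse one page of a story
        return BeautifulSoup(urllib.request.urlopen(url).read(), 'html.parser')

    def append_pages(soup, base_url):
        # Follow the pager's 'Next' link and splice each following page's
        # story div into the first page, dropping the pager itself
        pager = soup.find('div', attrs={'class': 'pager'})
        while pager is not None:
            next_link = pager.find('a', string='Next')
            if next_link is None:
                break
            page = fetch(base_url + next_link['href'])
            story = page.find('div', attrs={'id': 'story'})
            pager.replace_with(story)
            pager = story.find('div', attrs={'class': 'pager'})
        return soup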
@@ -1,6 +1,6 @@

 __license__ = 'GPL v3'
-__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 blic.rs
 '''
@@ -21,21 +21,53 @@ class Blic(BasicNewsRecipe):
     masthead_url = 'http://www.blic.rs/resources/images/header/header_back.png'
     language = 'sr'
     publication_type = 'newspaper'
-    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: Georgia, serif1, serif} .article_description{font-family: Arial, sans1, sans-serif} .img_full{float: none} img{margin-bottom: 0.8em} '
+    extra_css = """
+        @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
+        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
+        body{font-family: Georgia, serif1, serif}
+        .articledescription,#nadnaslov,.article_info{font-family: Arial, sans1, sans-serif}
+        .img_full{float: none}
+        #nadnaslov{font-size: small}
+        #article_lead{font-size: 1.5em}
+        h1{color: red}
+        .potpis{font-size: x-small; color: gray}
+        .article_info{font-size: small}
+        img{margin-bottom: 0.8em; margin-top: 0.8em; display: block}
+        """

     conversion_options = {
         'comment' : description
         , 'tags' : category
         , 'publisher': publisher
         , 'language' : language
+        , 'linearize_tables' : True
     }

     preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
     remove_tags_before = dict(name='div', attrs={'id':'article_info'})
-    remove_tags = [dict(name=['object','link'])]
-    remove_attributes = ['width','height']
+    remove_tags = [dict(name=['object','link','meta','base','object','embed'])]
+    remove_attributes = ['width','height','m_id','m_ext','mlg_id','poll_id','v_id']

-    feeds = [(u'Danasnje Vesti', u'http://www.blic.rs/rss/danasnje-vesti')]
+    feeds = [
+        (u'Politika' , u'http://www.blic.rs/rss/Vesti/Politika')
+        ,(u'Tema Dana' , u'http://www.blic.rs/rss/Vesti/Tema-Dana')
+        ,(u'Svet' , u'http://www.blic.rs/rss/Vesti/Svet')
+        ,(u'Drustvo' , u'http://www.blic.rs/rss/Vesti/Drustvo')
+        ,(u'Ekonomija' , u'http://www.blic.rs/rss/Vesti/Ekonomija')
+        ,(u'Hronika' , u'http://www.blic.rs/rss/Vesti/Hronika')
+        ,(u'Beograd' , u'http://www.blic.rs/rss/Vesti/Beograd')
+        ,(u'Srbija' , u'http://www.blic.rs/rss/Vesti/Srbija')
+        ,(u'Vojvodina' , u'http://www.blic.rs/rss/Vesti/Vojvodina')
+        ,(u'Republika Srpska' , u'http://www.blic.rs/rss/Vesti/Republika-Srpska')
+        ,(u'Reportaza' , u'http://www.blic.rs/rss/Vesti/Reportaza')
+        ,(u'Dodatak' , u'http://www.blic.rs/rss/Vesti/Dodatak')
+        ,(u'Zabava' , u'http://www.blic.rs/rss/Zabava')
+        ,(u'Kultura' , u'http://www.blic.rs/rss/Kultura')
+        ,(u'Slobodno Vreme' , u'http://www.blic.rs/rss/Slobodno-vreme')
+        ,(u'IT' , u'http://www.blic.rs/rss/IT')
+        ,(u'Komentar' , u'http://www.blic.rs/rss/Komentar')
+        ,(u'Intervju' , u'http://www.blic.rs/rss/Intervju')
+    ]

     def print_version(self, url):
@@ -44,4 +76,4 @@ class Blic(BasicNewsRecipe):
     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
             del item['style']
-        return self.adeify_images(soup)
+        return soup
35  resources/recipes/cicero.recipe  Normal file
@@ -0,0 +1,35 @@
from calibre.web.feeds.news import BasicNewsRecipe

class Cicero(BasicNewsRecipe):
    timefmt = ' [%Y-%m-%d]'
    title = u'Cicero'
    __author__ = 'mad@sharktooth.de'
    description = u'Magazin f\xfcr politische Kultur'
    oldest_article = 7
    language = 'de'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    publisher = 'Ringier Publishing'
    category = 'news, politics, Germany'
    encoding = 'iso-8859-1'
    publication_type = 'magazine'
    masthead_url = 'http://www.cicero.de/img2/cicero_logo_rss.gif'
    feeds = [
        (u'Das gesamte Portfolio', u'http://www.cicero.de/rss/rss.php?ress_id='),
        #(u'Alle Heft-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=heft'),
        #(u'Alle Online-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=online'),
        #(u'Berliner Republik', u'http://www.cicero.de/rss/rss.php?ress_id=4'),
        #(u'Weltb\xfchne', u'http://www.cicero.de/rss/rss.php?ress_id=1'),
        #(u'Salon', u'http://www.cicero.de/rss/rss.php?ress_id=7'),
        #(u'Kapital', u'http://www.cicero.de/rss/rss.php?ress_id=6'),
        #(u'Netzst\xfccke', u'http://www.cicero.de/rss/rss.php?ress_id=9'),
        #(u'Leinwand', u'http://www.cicero.de/rss/rss.php?ress_id=12'),
        #(u'Bibliothek', u'http://www.cicero.de/rss/rss.php?ress_id=15'),
        (u'Kolumne - Alle Kolulmnen', u'http://www.cicero.de/rss/rss2.php?ress_id='),
        #(u'Kolumne - Schreiber, Berlin', u'http://www.cicero.de/rss/rss2.php?ress_id=35'),
        #(u'Kolumne - TV Kritik', u'http://www.cicero.de/rss/rss2.php?ress_id=34')
    ]

    def print_version(self, url):
        return 'http://www.cicero.de/page_print.php?' + url.rpartition('?')[2]
@@ -11,7 +11,7 @@ class CNetJapan(BasicNewsRecipe):
         (u'CNet Blog', u'http://feed.japan.cnet.com/rss/blog/index.rdf')
     ]
     language = 'ja'
-    encoding = 'Shift_JIS'
+    encoding = 'utf-8'
     remove_javascript = True

     preprocess_regexps = [
@@ -7,22 +7,29 @@ class DallasNews(BasicNewsRecipe):
     max_articles_per_feed = 25

     no_stylesheets = True
-    remove_tags_before = dict(name='h2', attrs={'class':'vitstoryheadline'})
-    remove_tags_after = dict(name='div', attrs={'style':'width: 100%; clear: right'})
+    use_embedded_content = False
+    remove_tags_before = dict(name='h1')
+    remove_tags_after = dict(name='div', attrs={'id':'article_tools_bottom'})
+    keep_only_tags = {'class':lambda x: x and 'article' in x}
     remove_tags = [
-        dict(name='iframe'),
-        dict(name='div', attrs={'class':'biblockmore'}),
-        dict(name='div', attrs={'style':'width: 100%; clear: right'}),
-        dict(name='div', attrs={'id':'article_tools_bottom'}),
-        #dict(name='ul', attrs={'class':'articleTools'}),
+        {'class':['DMNSocialTools', 'article ', 'article first ', 'article premium']},
     ]

     feeds = [
-        ('Latest News', 'http://www.dallasnews.com/newskiosk/rss/dallasnewslatestnews.xml'),
-        ('Local News', 'http://www.dallasnews.com/newskiosk/rss/dallasnewslocalnews.xml'),
-        ('Nation and World', 'http://www.dallasnews.com/newskiosk/rss/dallasnewsnationworld.xml'),
-        ('Politics', 'http://www.dallasnews.com/newskiosk/rss/dallasnewsnationalpolitics.xml'),
-        ('Science', 'http://www.dallasnews.com/newskiosk/rss/dallasnewsscience.xml'),
+        ('Local News',
+            'http://www.dallasnews.com/news/politics/local-politics/?rss'),
+        ('National Politics',
+            'http://www.dallasnews.com/news/politics/national-politic/?rss'),
+        ('State Politics',
+            'http://www.dallasnews.com/news/politics/state-politics/?rss'),
+        ('Religion',
+            'http://www.dallasnews.com/news/religion/?rss'),
+        ('Crime',
+            'http://www.dallasnews.com/news/crime/headlines/?rss'),
+        ('Celebrity News',
+            'http://www.dallasnews.com/entertainment/celebrity-news/?rss&listname=TopStories'),
+        ('Nation',
+            'http://www.dallasnews.com/news/nation-world/nation/?rss'),
+        ('World',
+            'http://www.dallasnews.com/news/nation-world/world/?rss'),
     ]
@@ -22,7 +22,7 @@ class Deia(BasicNewsRecipe):
     cover_url ='http://2.bp.blogspot.com/_RjrWzC6tI14/TM6jrPLaBZI/AAAAAAAAFaI/ayffwxidFEY/s1600/2009-10-13-logo-deia.jpg'
     timefmt ='[%a, %d %b, %Y]'
     encoding ='utf8'
-    language ='es_ES'
+    language ='es'
     remove_javascript =True
     remove_tags_after =dict(id='Texto')
     remove_tags_before =dict(id='Texto')
@@ -28,7 +28,7 @@ class DilbertBig(BasicNewsRecipe):
         ,'publisher' : publisher
     }

-    feeds = [(u'Dilbert', u'http://feeds.dilbert.com/DilbertDailyStrip' )]
+    feeds = [(u'Dilbert', u'http://feed.dilbert.com/dilbert/daily_strip' )]

     def get_article_url(self, article):
         return article.get('feedburner_origlink', None)
@@ -9,7 +9,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.BeautifulSoup import Tag, NavigableString

-import mechanize, string, urllib, time, re
+import string, time, re

 class Economist(BasicNewsRecipe):

@@ -18,19 +18,19 @@ class Economist(BasicNewsRecipe):

     __author__ = "Kovid Goyal"
     INDEX = 'http://www.economist.com/printedition'
-    description = ('Global news and current affairs from a European perspective.'
-            ' Needs a subscription from ')+INDEX
+    description = 'Global news and current affairs from a European perspective.'

     oldest_article = 7.0
     cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
     remove_tags = [dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
             dict(attrs={'class':['dblClkTrk', 'ec-article-info']})]
     keep_only_tags = [dict(id='ec-article-body')]
-    needs_subscription = True
+    needs_subscription = False
     no_stylesheets = True
     preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
             lambda x:'</html>')]

+    '''
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         br.open('http://www.economist.com')
@@ -50,6 +50,7 @@ class Economist(BasicNewsRecipe):
             }))
         br.open(req).read()
         return br
+    '''

     def parse_index(self):
         try:
@@ -7,12 +7,12 @@ from lxml import html

 class Economist(BasicNewsRecipe):

-    title = 'The Economist (free)'
+    title = 'The Economist (RSS)'
     language = 'en'

     __author__ = "Kovid Goyal"
     description = ('Global news and current affairs from a European perspective.'
-            ' Much slower than the subscription based version.')
+            ' Much slower than the print edition based version.')

     oldest_article = 7.0
     cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
122  resources/recipes/el_correo.recipe  Normal file
@@ -0,0 +1,122 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '08 January 2011, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Biscay'
__version__ = 'v0.08'
__date__ = '08, January 2011'
'''
http://www.elcorreo.com/
'''

import time
import re
from calibre.web.feeds.news import BasicNewsRecipe

class heraldo(BasicNewsRecipe):
    __author__ = 'desUBIKado'
    description = 'Daily newspaper from Biscay'
    title = u'El Correo'
    publisher = 'Vocento'
    category = 'News, politics, culture, economy, general interest'
    oldest_article = 2
    delay = 1
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    encoding = 'iso-8859-1'
    remove_empty_feeds = True
    remove_javascript = False

    feeds = [
        (u'Portada', u'http://www.elcorreo.com/vizcaya/portada.xml'),
        (u'Local', u'http://www.elcorreo.com/vizcaya/rss/feeds/vizcaya.xml'),
        (u'Internacional', u'http://www.elcorreo.com/vizcaya/rss/feeds/internacional.xml'),
        (u'Econom\xeda', u'http://www.elcorreo.com/vizcaya/rss/feeds/economia.xml'),
        (u'Pol\xedtica', u'http://www.elcorreo.com/vizcaya/rss/feeds/politica.xml'),
        (u'Opini\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/opinion.xml'),
        (u'Deportes', u'http://www.elcorreo.com/vizcaya/rss/feeds/deportes.xml'),
        (u'Sociedad', u'http://www.elcorreo.com/vizcaya/rss/feeds/sociedad.xml'),
        (u'Cultura', u'http://www.elcorreo.com/vizcaya/rss/feeds/cultura.xml'),
        (u'Televisi\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/television.xml'),
        (u'Gente', u'http://www.elcorreo.com/vizcaya/rss/feeds/gente.xml')
    ]

    keep_only_tags = [
        dict(name='div', attrs={'class':['grouphead','date','art_head','story-texto','text','colC_articulo','contenido_comentarios']}),
        dict(name='div', attrs={'id':['articulo','story-texto','story-entradilla']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':['art_barra','detalles-opinion','formdenunciar','modulo calculadoras','nubetags','pie']}),
        dict(name='div', attrs={'class':['mod_lomas','bloque_lomas','blm_header','link-app3','link-app4','botones_listado']}),
        dict(name='div', attrs={'class':['navegacion_galeria','modulocanalpromocion','separa','separacion','compartir','tags_relacionados']}),
        dict(name='div', attrs={'class':['moduloBuscadorDeportes','modulo-gente','moddestacadopeq','OpcArt','articulopiniones']}),
        dict(name='div', attrs={'class':['modulo-especial','publiEspecial']}),
        dict(name='div', attrs={'id':['articulopina']}),
        dict(name='br', attrs={'class':'clear'}),
        dict(name='form', attrs={'name':'frm_conversor2'})
    ]

    remove_tags_before = dict(name='div', attrs={'class':'articulo '})
    remove_tags_after = dict(name='div', attrs={'class':'comentarios'})

    def get_cover_url(self):
        cover = None
        st = time.localtime()
        year = str(st.tm_year)
        month = "%.2d" % st.tm_mon
        day = "%.2d" % st.tm_mday
        # http://img.kiosko.net/2011/01/02/es/elcorreo.750.jpg
        # http://info.elcorreo.com/pdf/06012011-viz.pdf
        cover = 'http://info.elcorreo.com/pdf/' + day + month + year + '-viz.pdf'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except:
            self.log("\nPortada no disponible")
            cover = 'http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
        return cover

    extra_css = '''
        h1, .headline {font-family:Arial,Helvetica,sans-serif; font-weight:bold; font-size:30px;}
        h2, .subhead {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal; font-size:18px;}
        h3, .overhead {font-family:Arial,Helvetica,sans-serif; font-weight:bold; font-size:16px;}
        h4 {font-family:Arial,Helvetica,sans-serif; font-weight:bold; font-size:16px;}
        h5 {font-family:Arial,Helvetica,sans-serif; font-weight:normal; font-size:16px;}
        h6 {font-family:Arial,Helvetica,sans-serif; font-weight:bold; font-size:16px;}
        .date, .byline, .photo {font-family:Arial,Helvetica,sans-serif; font-weight:bold; font-size:14px;}
        img {margin-bottom: 0.4em}
        '''

    preprocess_regexps = [
        # To present the image of the embedded video
        (re.compile(r'var RUTA_IMAGEN', re.DOTALL|re.IGNORECASE), lambda match: '</script><img src'),
        (re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
        (re.compile(r'var SITIO = "elcorreo";', re.DOTALL|re.IGNORECASE), lambda match: '<SCRIPT TYPE="text/JavaScript"'),

        # To separate paragraphs with a blank line
        (re.compile(r'<div class="p"', re.DOTALL|re.IGNORECASE), lambda match: '<p></p><div class="p"'),

        # To put a blank line between the subtitle and the date and time of the news
        (re.compile(r'<div class="date">', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="date">'),

        # To put a blank line between the intro of the embedded videos and the previous text
        (re.compile(r'<div class="video"', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="video"'),

        # To view photos from the first when these are presented as a gallery
        (re.compile(r'src="/img/shim.gif"', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'rel=', re.DOTALL|re.IGNORECASE), lambda match: 'src='),

        # To remove the link of the title
        (re.compile(r'<h1 class="headline">\n<a href="', re.DOTALL|re.IGNORECASE), lambda match: '<h1 class="'),
        (re.compile(r'</a>\n</h1>', re.DOTALL|re.IGNORECASE), lambda match: '</h1>'),
    ]
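The get_cover_url method above builds a dated PDF cover URL, probes it, and falls back to a static logo. That pattern is easy to test outside calibre; a minimal sketch with urllib standing in for calibre's mechanize-based browser (URLs copied from the recipe):

    import time
    import urllib.request

    def todays_cover_url():
        # Dated cover, e.g. http://info.elcorreo.com/pdf/21012011-viz.pdf
        st = time.localtime()
        cover = 'http://info.elcorreo.com/pdf/%02d%02d%d-viz.pdf' % (
            st.tm_mday, st.tm_mon, st.tm_year)
        try:
            urllib.request.urlopen(cover)  # probe; raises if unavailable
        except Exception:
            # Fall back to the static logo the recipe uses
            cover = ('http://www.elcorreo.com/vizcaya/noticias/201002/02/'
                     'Media/logo-elcorreo-nuevo.png')
        return cover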
@@ -9,13 +9,14 @@ __docformat__ = 'restructuredtext en'
 elpais.es
 '''

+from time import strftime

 from calibre.web.feeds.news import BasicNewsRecipe

 class ElPais(BasicNewsRecipe):
     __author__ = 'Kovid Goyal & Lorenzo Vigentini & Jordi Balcells'
     description = 'Main daily newspaper from Spain'

-    cover_url = 'http://www.elpais.com/im/tit_logo_global.gif'
     title = u'El Pais'
     publisher = u'Ediciones El Pa\xeds SL'
     category = 'News, politics, culture, economy, general interest'
@@ -62,6 +63,6 @@ class ElPais(BasicNewsRecipe):
         (u'Vi\xf1etas', u'http://www.elpais.com/rss/feed.html?feedId=17058')
     ]

-    def print_version(self, url):
-        url = url+'?print=1'
-        return url
+    def get_cover_url(self):
+        return 'http://img5.kiosko.net/' + strftime("%Y/%m/%d") + '/es/elpais.750.jpg'
43  resources/recipes/el_publico.recipe  Normal file
@@ -0,0 +1,43 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Gerardo Diez'
__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
__docformat__ = 'restructuredtext en'

'''
publico.es
'''
from calibre.web.feeds.recipes import BasicNewsRecipe

class Publico(BasicNewsRecipe):
    title = u'Publico.es'
    __author__ = 'Gerardo Diez'
    publisher = u'Mediapubli Sociedad de Publicaciones y Ediciones S.L.'
    category = 'news, politics, finances, world, spain, science, catalunya'
    oldest_article = 1
    max_articles_per_feed = 100
    simultaneous_downloads = 10
    cover_url = u'http://imagenes.publico.es/css/img/logo_publico.gif'
    timefmt = '[%a, %d %b, %Y]'
    encoding = 'utf8'
    language = 'es'
    remove_javascript = True
    no_stylesheets = True
    keep_only_tags = dict(id='main')
    remove_tags = [
        dict(name='div', attrs={'class':['Noticias_642x50', 'contInfo ancho']}),
        dict(name='ul', attrs={'class':['navComentarios', 'comentarios']}),
        dict(name='div', attrs={'id':['commentsContext', 'toolbar', 'comentarios']}),
        dict(name='h5', attrs={'id':'comentarios'})
    ]
    feeds = [(u'Internacional', u'http://www.publico.es/estaticos/rss/internacional'),
        (u'Espa\xf1a', u'http://www.publico.es/estaticos/rss/espana'),
        (u'Dinero', u'http://www.publico.es/estaticos/rss/dinero'),
        (u'Ciencias', u'http://www.publico.es/estaticos/rss/ciencias'),
        (u'Culturas', u'http://www.publico.es/estaticos/rss/culturas'),
        (u'Deportes', u'http://www.publico.es/estaticos/rss/deportes'),
        (u'Televisi\xf3n y Gente', u'http://www.publico.es/estaticos/rss/televisionygente'),
        (u'Catalu\xf1a', u'http://www.publico.es/estaticos/rss/catalunya'),
        (u'Viajes', u'http://www.publico.es/estaticos/rss/viajes')]
@@ -17,7 +17,7 @@ class ElPais_RSS(BasicNewsRecipe):
     no_stylesheets = True
     encoding = 'cp1252'
     use_embedded_content = False
-    language = 'es_ES'
+    language = 'es'
     remove_empty_feeds = True
     publication_type = 'newspaper'
     masthead_url = 'http://www.elpais.com/im/tit_logo.gif'
@@ -1,7 +1,5 @@
-#!/usr/bin/env python
-
 __license__ = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 exiledonline.com
 '''
@@ -21,17 +19,19 @@ class Exiled(BasicNewsRecipe):
     encoding = 'utf8'
     remove_javascript = True
     language = 'en'
+    publication_type = 'newsblog'
+    masthead_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
+    extra_css = """
+        body{font-family: Arial,Helvetica,sans-serif}
+        #topslug{font-size: xx-large; font-weight: bold; color: red}
+        """

-    cover_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
-
-    html2lrf_options = [
-        '--comment' , description
-        , '--base-font-size', '10'
-        , '--category' , category
-        , '--publisher' , publisher
-        ]
-
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
+    conversion_options = {
+        'comment'     : description
+        , 'tags'      : category
+        , 'publisher' : publisher
+        , 'language'  : language
+        }

     keep_only_tags = [dict(name='div', attrs={'id':'main'})]
@@ -47,12 +47,13 @@ class Exiled(BasicNewsRecipe):
     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
             del item['style']
-        mtag = '\n<meta http-equiv="Content-Language" content="en"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n'
-        soup.head.insert(0,mtag)
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
         return soup

     def get_article_url(self, article):
         raw = article.get('link', None)
         final = raw + 'all/1/'
         return final
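The loop added to preprocess_html above flattens hyperlinks: any <a> whose only content is a plain string is replaced by that string, so converted pages don't carry link markup. A minimal sketch of the idiom, assuming calibre's bundled BeautifulSoup:

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<p>Read <a href="http://example.com/">this story</a> now.</p>')
    for alink in soup.findAll('a'):
        if alink.string is not None:         # only links whose sole child is text
            alink.replaceWith(alink.string)  # drop the tag, keep the text
    # soup now renders: <p>Read this story now.</p>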
@@ -1,59 +1,79 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
 __license__ = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
-'''
-www.expansion.com
-'''
-
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag
-
-class Expansion(BasicNewsRecipe):
-    title = 'Diario Expansion'
-    __author__ = 'Darko Miletic'
-    description = 'Lider de informacion de mercados, economica y politica'
-    publisher = 'expansion.com'
-    category = 'news, politics, Spain'
-    oldest_article = 2
-    max_articles_per_feed = 100
-    no_stylesheets = True
-    use_embedded_content = False
-    delay = 1
-    encoding = 'iso-8859-15'
-    language = 'es'
-    direction = 'ltr'
-
-    html2lrf_options = [
-        '--comment' , description
-        , '--category' , category
-        , '--publisher', publisher
-        ]
-
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
-
-    feeds = [
-        (u'Ultimas noticias', u'http://rss.expansion.com/rss/descarga.htm?data2=178')
-        ,(u'Temas del dia' , u'http://rss.expansion.com/rss/descarga.htm?data2=178')
-        ]
-
-    keep_only_tags = [dict(name='div', attrs={'id':'principal'})]
-
-    remove_tags = [
-        dict(name=['object','link','script'])
-        ,dict(name='div', attrs={'class':['utilidades','tit_relacionadas']})
-        ]
-
-    remove_tags_after = [dict(name='div', attrs={'class':'tit_relacionadas'})]
-
-    def preprocess_html(self, soup):
-        soup.html['dir' ] = self.direction
-        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
-        soup.head.insert(0,mcharset)
-        for item in soup.findAll(style=True):
-            del item['style']
-        return soup
+__author__ = 'Gerardo Diez'
+__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
+description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
+__docformat__ = 'restructuredtext en'
+'''
+expansion.es
+'''
+from calibre.web.feeds.recipes import BasicNewsRecipe
+class Publico(BasicNewsRecipe):
+    title =u'Expansion.com'
+    __author__ ='Gerardo Diez'
+    publisher =u'Unidad Editorial Información Económica, S.L.'
+    category ='finances, catalunya'
+    oldest_article =1
+    max_articles_per_feed =100
+    simultaneous_downloads =10
+    cover_url =u'http://estaticos01.expansion.com/iconos/v2.x/v2.0/cabeceras/logo_expansion.png'
+    timefmt ='[%A, %d %B, %Y]'
+    encoding ='latin'
+    language ='es'
+    remove_javascript =True
+    no_stylesheets =True
+    keep_only_tags =dict(name='div', attrs={'class':['noticia primer_elemento']})
+    remove_tags =[
+        dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto']}),
+        dict(name='ul', attrs={'class':['bolos_desarrollo_noticia']}),
+        dict(name='span', attrs={'class':['comentarios']}),
+        dict(name='p', attrs={'class':['cintillo_comentarios', 'cintillo_comentarios formulario']}),
+        dict(name='div', attrs={'id':['comentarios_lectores_listado']})
+        ]
+    feeds =[
+        (u'Portada', u'http://estaticos.expansion.com/rss/portada.xml'),
+        (u'Portada: Bolsas', u'http://estaticos.expansion.com/rss/mercados.xml'),
+        (u'Divisas', u'http://estaticos.expansion.com/rss/mercadosdivisas.xml'),
+        (u'Euribor', u'http://estaticos.expansion.com/rss/mercadoseuribor.xml'),
+        (u'Materias Primas', u'http://estaticos.expansion.com/rss/mercadosmateriasprimas.xml'),
+        (u'Renta Fija', u'http://estaticos.expansion.com/rss/mercadosrentafija.xml'),
+
+        (u'Portada: Mi Dinero', u'http://estaticos.expansion.com/rss/midinero.xml'),
+        (u'Hipotecas', u'http://estaticos.expansion.com/rss/midinerohipotecas.xml'),
+        (u'Créditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
+        (u'Pensiones', u'http://estaticos.expansion.com/rss/midineropensiones.xml'),
+        (u'Fondos de Inversión', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
+        (u'Motor', u'http://estaticos.expansion.com/rss/midineromotor.xml'),
+
+        (u'Portada: Empresas', u'http://estaticos.expansion.com/rss/empresas.xml'),
+        (u'Banca', u'http://estaticos.expansion.com/rss/empresasbanca.xml'),
+        (u'TMT', u'http://estaticos.expansion.com/rss/empresastmt.xml'),
+        (u'Energía', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
+        (u'Inmobiliario y Construcción', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
+        (u'Transporte y Turismo', u'http://estaticos.expansion.com/rss/empresastransporte.xml'),
+        (u'Automoción e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
+        (u'Distribución', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
+        (u'Deporte y Negocio', u' http://estaticos.expansion.com/rss/empresasdeporte.xml'),
+        (u'Mi Negocio', u'http://estaticos.expansion.com/rss/empresasminegocio.xml'),
+        (u'Interiores', u'http://estaticos.expansion.com/rss/empresasinteriores.xml'),
+        (u'Digitech', u'http://estaticos.expansion.com/rss/empresasdigitech.xml'),
+
+        (u'Portada: Economía y Política', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
+        (u'Política', u'http://estaticos.expansion.com/rss/economia.xml'),
+        (u'Portada: Sociedad', u'http://estaticos.expansion.com/rss/entorno.xml'),
+
+        (u'Portada: Opinión', u'http://estaticos.expansion.com/rss/opinion.xml'),
+        (u'Llaves y editoriales', u'http://estaticos.expansion.com/rss/opinioneditorialyllaves.xml'),
+        (u'Tribunas', u'http://estaticos.expansion.com/rss/opiniontribunas.xml'),
+
+        (u'Portada: Jurídico', u'http://estaticos.expansion.com/rss/juridico.xml'),
+        (u'Entrevistas', u'http://estaticos.expansion.com/rss/juridicoentrevistas.xml'),
+        (u'Opinión', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
+        (u'Sentencias', u'http://estaticos.expansion.com/rss/juridicosentencias.xml'),
+
+        (u'Mujer', u'http://estaticos.expansion.com/rss/mujer-empresa.xml'),
+        (u'Cataluña', u'http://estaticos.expansion.com/rss/catalunya.xml'),
+        (u'Función pública', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
+        ]
@@ -1,5 +1,5 @@
 __license__ = 'GPL v3'
-__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 ft.com
 '''
@@ -52,12 +52,9 @@ class FinancialTimes(BasicNewsRecipe):
        .copyright{font-size: x-small}
        """

-    def parse_index(self):
+    def get_artlinks(self, elem):
         articles = []
-        soup = self.index_to_soup(self.INDEX)
-        wide = soup.find('div',attrs={'class':'wide'})
-        if wide:
-            for item in wide.findAll('a',href=True):
+        for item in elem.findAll('a',href=True):
             url   = self.PREFIX + item['href']
             title = self.tag_to_string(item)
             date  = strftime(self.timefmt)
@@ -67,7 +64,26 @@ class FinancialTimes(BasicNewsRecipe):
                 ,'url'        :url
                 ,'description':''
                 })
-        return [('FT UK edition',articles)]
+        return articles
+
+    def parse_index(self):
+        feeds = []
+        soup = self.index_to_soup(self.INDEX)
+        wide = soup.find('div',attrs={'class':'wide'})
+        if not wide:
+            return feeds
+        strest = wide.findAll('h3', attrs={'class':'section'})
+        if not strest:
+            return feeds
+        st = wide.find('h4',attrs={'class':'section-no-arrow'})
+        if st:
+            strest.insert(0,st)
+        for item in strest:
+            ftitle = self.tag_to_string(item)
+            self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle))
+            feedarts = self.get_artlinks(item.parent.ul)
+            feeds.append((ftitle,feedarts))
+        return feeds

     def preprocess_html(self, soup):
         return self.adeify_images(soup)
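The refactor above splits article extraction into get_artlinks and rebuilds parse_index around the h3 section headings, yielding one feed per section instead of a single 'FT UK edition' feed. For context, a parse_index implementation returns a list of (section_title, article_list) tuples; a stripped-down sketch of that contract, with placeholder data:

    from calibre.web.feeds.news import BasicNewsRecipe

    class ParseIndexSketch(BasicNewsRecipe):
        # Hypothetical recipe illustrating the structure parse_index returns.
        title = 'parse_index example'

        def parse_index(self):
            articles = [{
                'title'      : 'Example article',           # shown in the TOC
                'url'        : 'http://example.com/a.html', # placeholder URL
                'date'       : '',
                'description': '',
                }]
            return [('Example section', articles)]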
@@ -1,4 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag
 import re

 class NatureNews(BasicNewsRecipe):
@@ -10,17 +11,76 @@ class NatureNews(BasicNewsRecipe):
     max_articles_per_feed = 50

     no_stylesheets = True
-    remove_tags_before = dict(name='h1', attrs={'class':'heading entry-title'})
-    remove_tags_after = dict(name='h2', attrs={'id':'comments'})
+    keep_only_tags = [dict(name='div', attrs={'id':'content'})]
+    # remove_tags_before = dict(name='h1', attrs={'class':'heading entry-title'})
+    # remove_tags_after = dict(name='h2', attrs={'id':'comments'})
     remove_tags = [
         dict(name='h2', attrs={'id':'comments'}),
         dict(attrs={'alt':'Advertisement'}),
         dict(name='div', attrs={'class':'ad'}),
+        dict(attrs={'class':'Z3988'}),
+        dict(attrs={'class':['formatpublished','type-of-article','cleardiv','disclaimer','buttons','comments xoxo']}),
+        dict(name='a', attrs={'href':'#comments'}),
+        dict(name='h2',attrs={'class':'subheading plusicon icon-add-comment'})
         ]

     preprocess_regexps = [
         (re.compile(r'<p>ADVERTISEMENT</p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
         ]

+    extra_css = '''
+        .author { text-align: right; font-size: small; line-height:1em; margin-top:0px; margin-left:0; margin-right:0; margin-bottom: 0; }
+        .imagedescription { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+        .imagecredit { font-size: x-small; font-style: normal; font-weight: bold}
+        '''
+
     feeds = [('Nature News', 'http://feeds.nature.com/news/rss/most_recent')]

+    def preprocess_html(self,soup):
+        # The author name is slightly buried - dig it up
+        author = soup.find('p', {'class':'byline'})
+        if author:
+            # Find out the author's name
+            authornamediv = author.find('span',{'class':'author fn'})
+            authornamelink = authornamediv.find('a')
+            if authornamelink:
+                authorname = authornamelink.contents[0]
+            else:
+                authorname = authornamediv.contents[0]
+            # Stick the author's name in the byline tag
+            tag = Tag(soup,'div')
+            tag['class'] = 'author'
+            tag.insert(0,authorname.strip())
+            author.replaceWith(tag)
+
+        # Change the intro from a p to a div
+        intro = soup.find('p',{'class':'intro'})
+        if intro:
+            tag = Tag(soup,'div')
+            tag['class'] = 'intro'
+            tag.insert(0,intro.contents[0])
+            intro.replaceWith(tag)
+
+        # Change span class=imagedescription to div
+        descr = soup.find('span',{'class':'imagedescription'})
+        if descr:
+            tag = Tag(soup,'div')
+            tag['class'] = 'imagedescription'
+            tag.insert(0,descr.renderContents())
+            descr.replaceWith(tag)
+
+        # The references are in a list, let's make them simpler
+        reflistcont = soup.find('ul',{'id':'article-refrences'})
+        if reflistcont:
+            reflist = reflistcont.li.renderContents()
+            tag = Tag(soup,'div')
+            tag['class'] = 'article-references'
+            tag.insert(0,reflist)
+            reflistcont.replaceWith(tag)
+
+        # Within the id=content div, we need to remove all the stuff after the end of the class=entry-content
+        entrycontent = soup.find('div',{'class':'entry-content'})
+        for nextSibling in entrycontent.findNextSiblings():
+            nextSibling.extract()
+
+        return soup
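The new preprocess_html above leans on one BeautifulSoup idiom throughout: create a fresh Tag, give it the desired name and class, move the old element's content into it, and replaceWith it. A minimal sketch of that idiom, assuming calibre's bundled BeautifulSoup:

    from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

    soup = BeautifulSoup('<p class="intro">Lead paragraph</p>')
    intro = soup.find('p', {'class':'intro'})
    if intro:
        tag = Tag(soup, 'div')            # replacement element
        tag['class'] = 'intro'            # keep the class so the CSS still applies
        tag.insert(0, intro.contents[0])  # move the text across
        intro.replaceWith(tag)            # swap it into the tree
    # soup now renders: <div class="intro">Lead paragraph</div>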
@@ -8,12 +8,13 @@ __docformat__ = 'restructuredtext en'
 globeandmail.com
 '''

+import re
+
 from calibre.web.feeds.news import BasicNewsRecipe

 class AdvancedUserRecipe1287083651(BasicNewsRecipe):
     title = u'Globe & Mail'
-    __license__ = 'GPL v3'
-    __author__ = 'Szing'
+    __author__ = 'Kovid Goyal'
     oldest_article = 2
     no_stylesheets = True
     max_articles_per_feed = 100
@@ -38,24 +39,19 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
         (u'Sports', u'http://www.theglobeandmail.com/auto/?service=rss')
         ]

-    keep_only_tags = [
-        dict(name='h1'),
-        dict(name='h2', attrs={'id':'articletitle'}),
-        dict(name='p', attrs={'class':['leadText', 'meta', 'leadImage', 'redtext byline', 'bodyText']}),
-        dict(name='div', attrs={'class':['news','articlemeta','articlecopy']}),
-        dict(name='id', attrs={'class':'article'}),
-        dict(name='table', attrs={'class':'todays-market'}),
-        dict(name='header', attrs={'id':'leadheader'})
+    preprocess_regexps = [
+        (re.compile(r'<head.*?</head>', re.DOTALL), lambda m: '<head></head>'),
+        (re.compile(r'<script.*?</script>', re.DOTALL), lambda m: ''),
         ]

+    remove_tags_before = dict(name='h1')
     remove_tags = [
-        dict(name='div', attrs={'id':['tabInside', 'ShareArticles', 'topStories']})
-        ]
-
-    #this has to be here or the text in the article appears twice.
-    remove_tags_after = [dict(id='article')]
+        dict(name='div', attrs={'id':['ShareArticles', 'topStories']}),
+        dict(href=lambda x: x and 'tracking=' in x),
+        {'class':['articleTools', 'pagination', 'Ads', 'topad',
+            'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]

     #Use the mobile version rather than the web version
     def print_version(self, url):
-        return url + '&service=mobile'
+        return url.rpartition('?')[0] + '?service=mobile'
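The new print_version strips any existing query string before appending the mobile flag, where the old code blindly tacked '&service=mobile' onto the URL. A quick illustration of what rpartition('?') does there (the URL is made up):

    url = 'http://www.theglobeandmail.com/news/story123/?cmpid=rss1'
    # rpartition splits on the last '?'; index [0] is everything before it
    print(url.rpartition('?')[0] + '?service=mobile')
    # -> http://www.theglobeandmail.com/news/story123/?service=mobile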
64 resources/recipes/gulfnews.recipe Normal file
@@ -0,0 +1,64 @@
+__license__ = 'GPL v3'
+__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
+'''
+gulfnews.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class GulfNews(BasicNewsRecipe):
+    title = 'Gulf News'
+    __author__ = 'Darko Miletic'
+    description = 'News from United Arab Emirrates, persian gulf and rest of the world'
+    publisher = 'Al Nisr Publishing LLC'
+    category = 'news, politics, UAE, world'
+    oldest_article = 2
+    max_articles_per_feed = 200
+    no_stylesheets = True
+    encoding = 'utf8'
+    use_embedded_content = False
+    language = 'en'
+    remove_empty_feeds = True
+    publication_type = 'newsportal'
+    masthead_url = 'http://gulfnews.com/media/img/gulf_news_logo.jpg'
+    extra_css = """
+        body{font-family: Arial,Helvetica,sans-serif }
+        img{margin-bottom: 0.4em; display:block}
+        h1{font-family: Georgia, 'Times New Roman', Times, serif}
+        ol,ul{list-style: none}
+        .synopsis{font-size: small}
+        .details{font-size: x-small}
+        .image{font-size: xx-small}
+        """
+
+    conversion_options = {
+        'comment'     : description
+        , 'tags'      : category
+        , 'publisher' : publisher
+        , 'language'  : language
+        }
+
+    remove_tags = [
+        dict(name=['meta','link','object','embed'])
+        ,dict(attrs={'class':['quickLinks','ratings']})
+        ,dict(attrs={'id':'imageSelector'})
+        ]
+    remove_attributes=['lang']
+    keep_only_tags=[
+        dict(name='h1')
+        ,dict(attrs={'class':['synopsis','details','image','article']})
+        ]
+
+    feeds = [
+        (u'UAE News'       , u'http://gulfnews.com/cmlink/1.446094')
+        ,(u'Business'      , u'http://gulfnews.com/cmlink/1.446098')
+        ,(u'Entertainment' , u'http://gulfnews.com/cmlink/1.446095')
+        ,(u'Sport'         , u'http://gulfnews.com/cmlink/1.446096')
+        ,(u'Life'          , u'http://gulfnews.com/cmlink/1.446097')
+        ]
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        return soup
@@ -3,13 +3,14 @@ __license__ = 'GPL v3'
 __copyright__ = '04 December 2010, desUBIKado'
 __author__ = 'desUBIKado'
 __description__ = 'Daily newspaper from Aragon'
-__version__ = 'v0.03'
-__date__ = '11, December 2010'
+__version__ = 'v0.04'
+__date__ = '6, Januery 2011'
 '''
 [url]http://www.heraldo.es/[/url]
 '''

 import time
+import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class heraldo(BasicNewsRecipe):
@@ -20,12 +21,13 @@ class heraldo(BasicNewsRecipe):
     category = 'News, politics, culture, economy, general interest'
     language = 'es'
     timefmt = '[%a, %d %b, %Y]'
-    oldest_article = 1
+    oldest_article = 2
+    delay = 1
     max_articles_per_feed = 100
     use_embedded_content = False
     remove_javascript = True
     no_stylesheets = True
-    recursion = 10

     feeds = [
         (u'Portadas', u'http://www.heraldo.es/index.php/mod.portadas/mem.rss')
@@ -37,7 +39,8 @@ class heraldo(BasicNewsRecipe):

     remove_tags = [dict(name='a', attrs={'class':['com flo-r','enl-if','enl-df']}),
         dict(name='div', attrs={'class':['brb-b-s con marg-btt','cnt-rel con']}),
-        dict(name='form', attrs={'class':'form'})]
+        dict(name='form', attrs={'class':'form'}),
+        dict(name='ul', attrs={'id':['cont-tags','pag-1']})]

     remove_tags_before = dict(name='div' , attrs={'id':'dts'})
     remove_tags_after = dict(name='div' , attrs={'id':'com'})
@@ -59,7 +62,16 @@ class heraldo(BasicNewsRecipe):
         return cover

     extra_css = '''
-        h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:xx-large;}
+        .con strong{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
+        .con h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
+        .con span{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:12px;}
+        .ent {font-family:Arial,Helvetica,sans-serif; font-weight:normal; font-style:italic; font-size:18px;}
+        img{margin-bottom: 0.4em}
         '''
+
+    preprocess_regexps = [
+        # To separate the comments with a blank line
+        (re.compile(r'<div id="com"', re.DOTALL|re.IGNORECASE), lambda match: '<br><div id="com"')
+        ]
@@ -5,6 +5,7 @@ class AdvancedUserRecipe1293122276(BasicNewsRecipe):
     __author__ = 'Jack Mason'
     author = 'IBM Global Business Services'
     publisher = 'IBM'
+    language = 'en'
     category = 'news, technology, IT, internet of things, analytics'
     oldest_article = 7
     max_articles_per_feed = 30
182 resources/recipes/ihned.recipe Normal file
@@ -0,0 +1,182 @@
+import re, time
+from calibre import strftime
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class IHNed(BasicNewsRecipe):
+
+    stahnout_vsechny = True
+    #True = downloads everything from the homepage
+    #False = downloads only today's articles (from the day the script is run)
+
+    title = 'iHNed'
+    __author__ = 'Karel Bílek'
+    language = 'cs'
+    description = 'Zprávy z iHNed.cz'
+    timefmt = ' [%a, %d %b, %Y]'
+    needs_subscription = False
+    remove_tags = [dict(attrs={'class':['borderbottom', 'web', 'foot', 'reklama', 'd-elm d-rellinks', 'd-elm']}),
+        dict(style=['text-align: center;']),
+        dict(id=['r-bfull']),
+        dict(name=['script', 'noscript', 'style'])]
+    encoding = 'windows-1250'
+    no_stylesheets = True
+    remove_tags_before = dict(attrs={'class':'d-nadtit'})
+    remove_tags_after = dict(attrs={'class':'like'})
+
+    conversion_options = {
+        'linearize_tables' : True,
+        }
+
+    def preprocess_html(self, soup):
+
+        def makeurl(wat):
+            return "http://ihned.cz"+wat;
+
+        for h1 in soup.findAll('h1'):
+            a = h1.find('a')
+            if a:
+                string = a.string
+                if string:
+                    soup.a.replaceWith(string)
+        for a in soup.findAll('a', href=True) :
+            cil = str(a['href'])
+            if cil.startswith("/") or cil.startswith("index"):
+                a['href'] = makeurl(cil)
+        return soup
+
+    def parse_index(self):
+
+        def makeurl(wat):
+            if wat.startswith("/") or wat.startswith("index"):
+                return "http://ihned.cz"+wat;
+            else:
+                return wat
+
+        articles = {} #the result, probably
+        key = None    #the current section
+        ans = []      #all the sections
+
+        articles["Hlavní"] = []
+        ans.append("Hlavní")
+
+        was = {}
+
+        def parse_subpage(url, name):
+            articles[name] = []
+            ans.append(name)
+
+            soup = self.index_to_soup(url)
+            otvirak = soup.find(True, attrs={'class':['otv']})
+            if otvirak:
+                #the code is copypasted here because I don't know python. simple as that.
+                a = otvirak.find('a', href=True)
+                title = self.tag_to_string(a, use_alt=True).strip()
+                txt = otvirak.find(True, attrs={'class':['txt']})
+                description = ''
+                if txt:
+                    match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
+                    if match:
+                        description = match.group(1)
+
+                pubdate = strftime('%d. %m.')
+                if not title in was:
+                    articles[name].append(
+                        dict(title=title, url=makeurl(a['href']), date=pubdate,
+                            description=description,
+                            content=''))
+
+            otv234 = soup.find(True, attrs={'class':['otv234', 'col2a']})
+            if otv234:
+                for ow in otv234.findAll(True, attrs={'class':['ow']}):
+                    a = ow.find('a', href=True)
+                    title = self.tag_to_string(a, use_alt=True).strip()
+                    description=''
+                    prx = ow.find(True, attrs={'class':['prx']});
+                    if prx:
+                        description = str(prx.string)
+                    nfo = ow.find(True, attrs={'class':['nfo']});
+                    pubdate = ''
+                    if nfo:
+                        dtime = time.localtime();
+                        day = dtime[2]
+                        month = dtime[1]
+
+                        pubdate = strftime('%d. %m.')
+
+                        match = re.search(r'([0-9]*)\.([0-9]*)\.', str(nfo))
+
+                        if self.stahnout_vsechny or (int(day) == int(match.group(1)) and int(month) == int(match.group(2))):
+                            if not title in was:
+                                articles[name].append(
+                                    dict(title=title, url=makeurl(a['href']), date=pubdate,
+                                        description=description,
+                                        content=''))
+
+        soup = self.index_to_soup('http://ihned.cz/')
+        otvirak = soup.find(True, attrs={'class':['otv']})
+        if otvirak:
+            a = otvirak.find('a', href=True)
+            title = self.tag_to_string(a, use_alt=True).strip()
+            txt = otvirak.find(True, attrs={'class':['txt']})
+            description = ''
+            if txt:
+                match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
+                if match:
+                    description = match.group(1)
+
+            pubdate = strftime('%d. %m.')
+            feed = "Hlavní"
+            articles[feed].append(
+                dict(title=title, url=(a['href']), date=pubdate,
+                    description=description,
+                    content=''))
+            was[title]=1
+
+        otvirak2345 = soup.find(True, attrs={'class':['otv2345']})
+        if otvirak2345:
+            for otv2 in otvirak2345.findAll(True, attrs={'class':['otv2-5']}):
+                a = otv2.find('a', attrs={'class':['tit2']}, href=True)
+                title = self.tag_to_string(a, use_alt=True).strip()
+                description=''
+                span = otv2.find('span');
+                if span:
+                    match = re.match(r'<span>\s*([^<]*)\s*<a', str(span), re.L)
+                    if match:
+                        description = match.group(1)
+                feed = "Hlavní"
+                pubdate = strftime('%d. %m.')
+                articles[feed].append(
+                    dict(title=title, url=(a['href']), date=pubdate,
+                        description=description,
+                        content=''))
+                was[title]=1
+
+        parse_subpage("http://komentare.ihned.cz/", "Komentáře")
+        parse_subpage("http://domaci.ihned.cz", "Domácí")
+        parse_subpage("http://ekonomika.ihned.cz", "Ekonomika")
+        parse_subpage("http://zahranicni.ihned.cz/", "Zahraničí");
+        parse_subpage("http://finweb.ihned.cz/", "Finance");
+        parse_subpage("http://digiweb.ihned.cz/", "DigiWeb");
+        parse_subpage("http://kultura.ihned.cz/", "Kultura")
+        parse_subpage("http://sport.ihned.cz/", "Sport");
+
+        #sort the categories
+        ans = self.sort_index_by(ans, {'Hlavni':1, 'Domácí':2, 'Ekonomika':5, 'Zahraničí':3, 'Finance':6, 'DigiWeb':7, 'Kultura':8, 'Sport':9, 'Komentáře':4})
+
+        #return them, but only the ones that are in the categories...
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return ans
@@ -6,6 +6,7 @@ class KANewsRecipe(BasicNewsRecipe):
     description = u'Nachrichten aus Karlsruhe, Deutschland und der Welt.'
     __author__ = 'tfeld'
     lang='de'
+    language = 'de'
     no_stylesheets = True

     oldest_article = 7
resources/recipes/kath_net.recipe
Normal file
17
resources/recipes/kath_net.recipe
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class AdvancedUserRecipe1295262156(BasicNewsRecipe):
|
||||||
|
title = u'kath.net'
|
||||||
|
__author__ = 'Bobus'
|
||||||
|
oldest_article = 7
|
||||||
|
language = 'en'
|
||||||
|
max_articles_per_feed = 100
|
||||||
|
|
||||||
|
feeds = [(u'kath.net', u'http://www.kath.net/2005/xml/index.xml')]
|
||||||
|
|
||||||
|
|
||||||
|
def print_version(self, url):
|
||||||
|
return url+"&print=yes"
|
||||||
|
|
||||||
|
extra_css = 'td.textb {font-size: medium;}'
|
||||||
|
|
@@ -3,12 +3,17 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class AdvancedUserRecipe1274742400(BasicNewsRecipe):

     title = u'Las Vegas Review Journal'
-    __author__ = 'Joel'
+    __author__ = 'Kovid Goyal'
     language = 'en'

     oldest_article = 7

     max_articles_per_feed = 100
+    keep_only_tags = [dict(id='content-main')]
+    remove_tags = [dict(id=['right-col-content', 'trending-topics']),
+        {'class':['ppy-outer']}
+        ]
+    no_stylesheets = True

     feeds = [
         (u'News', u'http://www.lvrj.com/news.rss'),
@@ -20,8 +20,8 @@ class LaVanguardia(BasicNewsRecipe):
     max_articles_per_feed = 100
     no_stylesheets = True
     use_embedded_content = False
-    delay = 1
-    encoding = 'cp1252'
+    delay = 5
+    # encoding = 'cp1252'
     language = 'es'

     direction = 'ltr'
@@ -35,7 +35,7 @@ class LaVanguardia(BasicNewsRecipe):
     html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

     feeds = [
-        (u'Ciudadanos' , u'http://feeds.feedburner.com/lavanguardia/ciudadanos')
+        (u'Portada' , u'http://feeds.feedburner.com/lavanguardia/home')
         ,(u'Cultura' , u'http://feeds.feedburner.com/lavanguardia/cultura')
         ,(u'Deportes' , u'http://feeds.feedburner.com/lavanguardia/deportes')
         ,(u'Economia' , u'http://feeds.feedburner.com/lavanguardia/economia')
@@ -45,17 +45,17 @@ class LaVanguardia(BasicNewsRecipe):
         ,(u'Internet y tecnologia', u'http://feeds.feedburner.com/lavanguardia/internet')
         ,(u'Motor' , u'http://feeds.feedburner.com/lavanguardia/motor')
         ,(u'Politica' , u'http://feeds.feedburner.com/lavanguardia/politica')
-        ,(u'Sucessos' , u'http://feeds.feedburner.com/lavanguardia/sucesos')
+        ,(u'Sucesos' , u'http://feeds.feedburner.com/lavanguardia/sucesos')
         ]

     keep_only_tags = [
-        dict(name='div', attrs={'class':'element1_3'})
+        dict(name='div', attrs={'class':'detalle noticia'})
         ]

     remove_tags = [
         dict(name=['object','link','script'])
-        ,dict(name='div', attrs={'class':['colC','peu']})
+        ,dict(name='div', attrs={'class':['colC','peu','jstoolbar']})
         ]

     remove_tags_after = [dict(name='div', attrs={'class':'text'})]
@@ -67,4 +67,3 @@ class LaVanguardia(BasicNewsRecipe):
         for item in soup.findAll(style=True):
             del item['style']
         return soup
-
32 resources/recipes/mail_and_guardian.recipe Normal file
@@ -0,0 +1,32 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1295081935(BasicNewsRecipe):
+    title = u'Mail & Guardian ZA News'
+    __author__ = '77ja65'
+    language = 'en'
+    oldest_article = 7
+    max_articles_per_feed = 30
+    no_stylesheets = True
+    masthead_url = 'http://c1608832.cdn.cloudfiles.rackspacecloud.com/mg_logo.gif'
+    remove_tags_after = [dict(id='content')]
+
+    feeds = [
+        (u'National News', u'http://www.mg.co.za/rss/national'),
+        (u'Top Stories', u'http://www.mg.co.za/rss'),
+        (u'Africa News', u'http://www.mg.co.za/rss/africa'),
+        (u'Sport', u'http://www.mg.co.za/rss/sport'),
+        (u'Business', u'http://www.mg.co.za/rss/business'),
+        (u'And In Other News', u'http://www.mg.co.za/rss/and-in-other-news'),
+        (u'World News', u'http://www.mg.co.za/rss/world')
+        ]
+
+    def print_version(self, url):
+        return url.replace('http://www.mg.co.za/article/',
+                'http://www.mg.co.za/printformat/single/')
+
+    extra_css = '''
+        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
+        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
+        '''
@@ -1,10 +1,9 @@
 __license__ = 'GPL v3'
-__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 msnbc.msn.com
 '''

-import re
 from calibre.web.feeds.recipes import BasicNewsRecipe

 class MsNBC(BasicNewsRecipe):
@@ -19,7 +18,16 @@ class MsNBC(BasicNewsRecipe):
     publisher = 'msnbc.com'
     category = 'news, USA, world'
     language = 'en'
-    extra_css = ' body{ font-family: sans-serif } .head{font-family: serif; font-size: xx-large; font-weight: bold; color: #CC0000} .abstract{font-weight: bold} .source{font-size: small} .updateTime{font-size: small} '
+    extra_css = """
+        body{ font-family: Georgia,Times,serif }
+        .hide{display: none}
+        .caption{font-family: Arial,sans-serif; font-size: x-small}
+        .entry-summary{font-family: Arial,sans-serif}
+        .copyright{font-size: 0.95em; font-style: italic}
+        .source-org{font-size: small; font-family: Arial,sans-serif}
+        img{display: block; margin-bottom: 0.5em}
+        span.byline{display: none}
+        """

     conversion_options = {
         'comments' : description
@@ -28,14 +36,20 @@ class MsNBC(BasicNewsRecipe):
         ,'publisher': publisher
         }

-    preprocess_regexps = [
-        (re.compile(r'</style></head>', re.DOTALL|re.IGNORECASE),lambda match: '</style>')
-        ,(re.compile(r'<div class="head">', re.DOTALL|re.IGNORECASE),lambda match: '</head><body><div class="head">'),
-        ]
-
-    remove_tags_before = dict(name='div', attrs={'class':'head'})
-    remove_tags_after = dict(name='div', attrs={'class':'copyright'})
-    remove_tags = [dict(name=['iframe','object','link','script','form'])]
+    remove_tags_before = dict(name='h1', attrs={'id':'headline'})
+    remove_tags_after = dict(name='span', attrs={'class':['copyright','Linear copyright']})
+    keep_only_tags=[
+        dict(attrs={'id':['headline','deck','byline','source','intelliTXT']})
+        ,dict(attrs={'class':['gl_headline','articleText','drawer-content Linear','v-center3','byline','textBodyBlack']})
+        ]
+    remove_attributes=['property','lang','rel','xmlns:fb','xmlns:v','xmlns:dc','xmlns:dcmitype','xmlns:og','xmlns:media','xmlns:vcard','typeof','itemscope','itemtype','itemprop','about','type','size','width','height','onreadystatechange','data','border','hspace','vspace']
+
+    remove_tags = [
+        dict(name=['iframe','object','link','embed','meta','table'])
+        ,dict(name='span', attrs={'class':['copyright','Linear copyright']})
+        ,dict(name='div', attrs={'class':'social'})
+        ]

     feeds = [
         (u'US News' , u'http://rss.msnbc.msn.com/id/3032524/device/rss/rss.xml')
@@ -48,11 +62,26 @@ class MsNBC(BasicNewsRecipe):
         ,(u'Tech & Science', u'http://rss.msnbc.msn.com/id/3032117/device/rss/rss.xml')
         ]

-    def print_version(self, url):
-        return url + 'print/1/displaymode/1098/'
-
     def preprocess_html(self, soup):
-        for item in soup.head.findAll('div'):
+        for item in soup.body.findAll('html'):
+            item.name='div'
+        for item in soup.body.findAll('div'):
+            if item.has_key('id') and item['id'].startswith('vine-'):
                 item.extract()
+            if item.has_key('class') and ( item['class'].startswith('ad') or item['class'].startswith('vine')):
+                item.extract()
+        for item in soup.body.findAll('img'):
+            if not item.has_key('alt'):
+                item['alt'] = 'image'
+        for item in soup.body.findAll('ol'):
+            if item.has_key('class') and item['class'].startswith('grid'):
+                item.extract()
+        for item in soup.body.findAll('span'):
+            if ( item.has_key('id') and item['id'].startswith('byLine') and item.string is None) or ( item.has_key('class') and item['class'].startswith('inline') ):
+                item.extract()
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
         return soup
@@ -10,6 +10,7 @@ import re
 class NationalGeographicNews(BasicNewsRecipe):
     title = u'National Geographic News'
     oldest_article = 7
+    language = 'en'
     max_articles_per_feed = 100
     remove_javascript = True
     no_stylesheets = True
@@ -27,6 +27,9 @@ class NikkeiNet_sub_economy(BasicNewsRecipe):
         {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
         {'class':"cmn-article_keyword cmn-clearfix"},
         {'class':"cmn-print_headline cmn-clearfix"},
+        {'class':"cmn-article_list"},
+        dict(id="ABOUT-NIKKEI"),
+        {'class':"cmn-sub_market"},
         ]
     remove_tags_after = {'class':"cmn-pr_list"}
@@ -1,5 +1,5 @@
 __license__ = 'GPL v3'
-__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 nrc.nl
 '''
@@ -15,13 +15,18 @@ class Pagina12(BasicNewsRecipe):
     oldest_article = 2
     max_articles_per_feed = 200
     no_stylesheets = True
-    encoding = 'cp1252'
+    encoding = 'utf8'
     use_embedded_content = False
     language = 'nl'
     country = 'NL'
     remove_empty_feeds = True
     masthead_url = 'http://www.nrc.nl/nrc.nl/images/logo_nrc.png'
-    extra_css = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} h1,h2,h3{text-align:left} '
+    extra_css = """
+        body{font-family: Georgia,serif }
+        img{margin-bottom: 0.4em; display: block}
+        .bijschrift,.sectie{font-size: x-small}
+        .sectie{color: gray}
+        """

     conversion_options = {
         'comment' : description
@@ -30,21 +35,42 @@ class Pagina12(BasicNewsRecipe):
         , 'language' : language
         }

-    keep_only_tags = [dict(name='div',attrs={'class':'article clearfix'})]
+    keep_only_tags = [dict(attrs={'class':'uitstekendekeus'})]
+    remove_tags = [
+        dict(name=['meta','base','link','object','embed'])
+        ,dict(attrs={'class':['reclamespace','tags-and-sharing']})
+        ]
+    remove_attributes=['lang']

     feeds = [
-        (u'Voorpagina'   , u'http://feeds.feedburner.com/NRCHandelsbladVoorpagina')
-        ,(u'Binnenland'  , u'http://feeds.feedburner.com/NRCHandelsbladBinnenland')
-        ,(u'Buitenland'  , u'http://feeds.feedburner.com/NRCHandelsbladBuitenland')
-        ,(u'Economie'    , u'http://feeds.feedburner.com/NRCHandelsbladEconomie')
-        ,(u'Kunst & Film', u'http://feeds.feedburner.com/nrc/NRCHandelsbladKunstEnFilm')
-        ,(u'Sport'       , u'http://feeds.feedburner.com/NRCHandelsbladSport')
-        ,(u'Wetenschap ' , u'http://www.nrc.nl/rss/wetenschap')
+        (u'Voor nieuws', u'http://www.nrc.nl/nieuws/categorie/nieuws/rss.php')
+        ,(u'Binnenland' , u'http://www.nrc.nl/nieuws/categorie/binnenland/rss.php')
+        ,(u'Buitenland' , u'http://www.nrc.nl/nieuws/categorie/buitenland/rss.php')
+        ,(u'Economie'   , u'http://www.nrc.nl/nieuws/categorie/economie/rss.php')
+        ,(u'Cultuur'    , u'http://www.nrc.nl/nieuws/categorie/cultuur/rss.php')
+        ,(u'Sport'      , u'http://www.nrc.nl/nieuws/categorie/sport/rss.php')
+        ,(u'Wetenschap ', u'http://www.nrc.nl/nieuws/categorie/wetenschap-nieuws/rss.php')
         ]

-    def print_version(self, url):
-        return url + '?service=Print'
-
     def preprocess_html(self, soup):
-        return self.adeify_images(soup)
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll('a'):
+            limg = item.find('img')
+            if item.string is not None:
+                str = item.string
+                item.replaceWith(str)
+            else:
+                if limg:
+                    item.name = 'div'
+                    atritems =['href','target','rel']
+                    for atit in atritems:
+                        if item.has_key(atit):
+                            del item[atit]
+                else:
+                    str = self.tag_to_string(item)
+                    item.replaceWith(str)
+        for item in soup.findAll('img'):
+            if not item.has_key('alt'):
+                item['alt'] = 'image'
+        return soup
@@ -586,7 +586,7 @@ class NYTimes(BasicNewsRecipe):
         return self.strip_anchors(soup)

     def postprocess_html(self,soup, True):
+        try:
             if self.one_picture_per_article:
                 # Remove all images after first
                 largeImg = soup.find(True, {'class':'articleSpanImage'})
@@ -621,10 +621,13 @@ class NYTimes(BasicNewsRecipe):
                 cgFirst.insert(insertLoc,firstImg)
             else:
                 self.log(">>> No class:'columnGroup first' found <<<")
+        except:
+            self.log("ERROR: One picture per article in postprocess_html")
+
+        try:
             # Change captions to italic
             for caption in soup.findAll(True, {'class':'caption'}) :
-                if caption and caption.contents[0]:
+                if caption and len(caption) > 0:
                     cTag = Tag(soup, "p", [("class", "caption")])
                     c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
                     mp_off = c.find("More Photos")
@@ -632,7 +635,10 @@ class NYTimes(BasicNewsRecipe):
                     c = c[:mp_off]
                     cTag.insert(0, c)
                     caption.replaceWith(cTag)
+        except:
+            self.log("ERROR: Problem in change captions to italic")
+
+        try:
             # Change <nyt_headline> to <h2>
             h1 = soup.find('h1')
             if h1:
@@ -653,7 +659,10 @@ class NYTimes(BasicNewsRecipe):
             hrs = soup.findAll('hr')
             for hr in hrs:
                 hr.extract()
+        except:
+            self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
+
+        try:
             # Change <h1> to <h3> - used in editorial blogs
             masthead = soup.find("h1")
             if masthead:
@@ -663,18 +672,27 @@ class NYTimes(BasicNewsRecipe):
                 tag = Tag(soup, "h3")
                 tag.insert(0, self.fixChars(masthead.contents[0]))
                 masthead.replaceWith(tag)
+        except:
+            self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
+
+        try:
             # Change <span class="bold"> to <b>
             for subhead in soup.findAll(True, {'class':'bold'}) :
                 if subhead.contents:
                     bTag = Tag(soup, "b")
                     bTag.insert(0, subhead.contents[0])
                     subhead.replaceWith(bTag)
+        except:
+            self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
+
+        try:
             divTag = soup.find('div',attrs={'id':'articleBody'})
             if divTag:
                 divTag['class'] = divTag['id']
+        except:
+            self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")
+
+        try:
             # Add class="authorId" to <div> so we can format with CSS
             divTag = soup.find('div',attrs={'id':'authorId'})
             if divTag and divTag.contents[0]:
@@ -683,5 +701,32 @@ class NYTimes(BasicNewsRecipe):
                 tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
                     use_alt=False)))
                 divTag.replaceWith(tag)
+        except:
+            self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
+
         return soup
+
+    def populate_article_metadata(self, article, soup, first):
+        shortparagraph = ""
+        try:
+            if len(article.text_summary.strip()) == 0:
+                articlebodies = soup.findAll('div',attrs={'class':'articleBody'})
+                if articlebodies:
+                    for articlebody in articlebodies:
+                        if articlebody:
+                            paras = articlebody.findAll('p')
+                            for p in paras:
+                                refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
+                                #account for blank paragraphs and short paragraphs by appending them to longer ones
+                                if len(refparagraph) > 0:
+                                    if len(refparagraph) > 70: #approximately one line of text
+                                        article.summary = article.text_summary = shortparagraph + refparagraph
+                                        return
+                                    else:
+                                        shortparagraph = refparagraph + " "
+                                        if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
+                                            shortparagraph = shortparagraph + "- "
+        except:
+            self.log("Error creating article descriptions")
+            return
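The populate_article_metadata added above synthesizes a description when the feed's own summary is empty: it walks the article body's paragraphs, gluing short fragments together until a paragraph longer than about one display line (70 characters) completes the summary. The accumulation logic, condensed to plain strings (the function name is ours, not calibre's):

    def build_summary(paragraphs):
        # Collect short fragments until a ~70+ char paragraph finishes the summary.
        short = ''
        for text in paragraphs:
            text = text.strip()
            if not text:
                continue           # skip blank paragraphs
            if len(text) > 70:     # roughly one line of text
                return short + text
            short += text + ' '
        return short.strip()

    print(build_summary(['By A. Reporter', 'A short deck.',
        'The opening paragraph of the story, long enough to serve as a summary line on its own.']))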
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
 
 __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
@@ -23,6 +24,10 @@ class NYTimes(BasicNewsRecipe):
     webEdition = False
     oldest_article = 7
 
+    # replace paid Kindle Version: the name will be changed to "The New York Times" to cause
+    # previous paid versions of the new york times to best sent to the back issues folder on the kindle
+    replaceKindleVersion = False
+
     # includeSections: List of sections to include. If empty, all sections found will be included.
     # Otherwise, only the sections named will be included. For example,
     #
@@ -94,6 +99,10 @@ class NYTimes(BasicNewsRecipe):
         title='New York Times (Web)'
         description = 'New York Times on the Web'
         needs_subscription = True
+    elif replaceKindleVersion:
+        title='The New York Times'
+        description = 'Today\'s New York Times'
+        needs_subscription = True
     else:
         title='New York Times'
         description = 'Today\'s New York Times'
@@ -150,6 +159,11 @@ class NYTimes(BasicNewsRecipe):
                     'relatedSearchesModule',
                     'side_tool',
                     'singleAd',
+                    'entry entry-utility', #added for DealBook
+                    'entry-tags', #added for DealBook
+                    'footer promos clearfix', #added for DealBook
+                    'footer links clearfix', #added for DealBook
+                    'inlineImage module', #added for DealBook
                     re.compile('^subNavigation'),
                     re.compile('^leaderboard'),
                     re.compile('^module'),
@@ -183,6 +197,9 @@ class NYTimes(BasicNewsRecipe):
                     'side_index',
                     'side_tool',
                     'toolsRight',
+                    'skybox', #added for DealBook
+                    'TopAd', #added for DealBook
+                    'related-content', #added for DealBook
                     ]),
             dict(name=['script', 'noscript', 'style','form','hr'])]
     no_stylesheets = True
@@ -237,7 +254,7 @@ class NYTimes(BasicNewsRecipe):
     def exclude_url(self,url):
         if not url.startswith("http"):
             return True
-        if not url.endswith(".html"):
+        if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
             return True
         if 'nytimes.com' not in url:
             return True
@@ -560,7 +577,6 @@ class NYTimes(BasicNewsRecipe):
 
-
     def preprocess_html(self, soup):
 
         if self.webEdition & (self.oldest_article>0):
             date_tag = soup.find(True,attrs={'class': ['dateline','date']})
             if date_tag:
@@ -583,10 +599,13 @@ class NYTimes(BasicNewsRecipe):
             img_div = soup.find('div','inlineImage module')
             if img_div:
                 img_div.extract()
 
         return self.strip_anchors(soup)
 
     def postprocess_html(self,soup, True):
 
+        try:
             if self.one_picture_per_article:
                 # Remove all images after first
                 largeImg = soup.find(True, {'class':'articleSpanImage'})
@@ -621,10 +640,13 @@ class NYTimes(BasicNewsRecipe):
                     cgFirst.insert(insertLoc,firstImg)
                 else:
                     self.log(">>> No class:'columnGroup first' found <<<")
+        except:
+            self.log("ERROR: One picture per article in postprocess_html")
 
+        try:
             # Change captions to italic
             for caption in soup.findAll(True, {'class':'caption'}) :
-                if caption and caption.contents[0]:
+                if caption and len(caption) > 0:
                     cTag = Tag(soup, "p", [("class", "caption")])
                     c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
                     mp_off = c.find("More Photos")
@@ -632,9 +654,13 @@ class NYTimes(BasicNewsRecipe):
                         c = c[:mp_off]
                     cTag.insert(0, c)
                     caption.replaceWith(cTag)
+        except:
+            self.log("ERROR: Problem in change captions to italic")
 
+        try:
             # Change <nyt_headline> to <h2>
             h1 = soup.find('h1')
+            blogheadline = str(h1) #added for dealbook
             if h1:
                 headline = h1.find("nyt_headline")
                 if headline:
@@ -642,18 +668,50 @@ class NYTimes(BasicNewsRecipe):
                     tag['class'] = "headline"
                     tag.insert(0, self.fixChars(headline.contents[0]))
                     h1.replaceWith(tag)
+                elif blogheadline.find('entry-title'):#added for dealbook
+                    tag = Tag(soup, "h2")#added for dealbook
+                    tag['class'] = "headline"#added for dealbook
+                    tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
+                    h1.replaceWith(tag)#added for dealbook
+
             else:
-                # Blog entry - replace headline, remove <hr> tags
+                # Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
                 headline = soup.find('title')
                 if headline:
                     tag = Tag(soup, "h2")
                     tag['class'] = "headline"
-                    tag.insert(0, self.fixChars(headline.contents[0]))
+                    tag.insert(0, self.fixChars(headline.renderContents()))
                     soup.insert(0, tag)
                     hrs = soup.findAll('hr')
                     for hr in hrs:
                         hr.extract()
+        except:
+            self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
 
+        try:
+            #if this is from a blog (dealbook, fix the byline format
+            bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
+            if bylineauthor:
+                tag = Tag(soup, "h6")
+                tag['class'] = "byline"
+                tag.insert(0, self.fixChars(bylineauthor.renderContents()))
+                bylineauthor.replaceWith(tag)
+        except:
+            self.log("ERROR: fixing byline author format")
 
+        try:
+            #if this is a blog (dealbook) fix the credit style for the pictures
+            blogcredit = soup.find('div',attrs={'class':'credit'})
+            if blogcredit:
+                tag = Tag(soup, "h6")
+                tag['class'] = "credit"
+                tag.insert(0, self.fixChars(blogcredit.renderContents()))
+                blogcredit.replaceWith(tag)
+        except:
+            self.log("ERROR: fixing credit format")
 
+        try:
             # Change <h1> to <h3> - used in editorial blogs
             masthead = soup.find("h1")
             if masthead:
@@ -663,18 +721,34 @@ class NYTimes(BasicNewsRecipe):
                 tag = Tag(soup, "h3")
                 tag.insert(0, self.fixChars(masthead.contents[0]))
                 masthead.replaceWith(tag)
+        except:
+            self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
 
+        try:
             # Change <span class="bold"> to <b>
             for subhead in soup.findAll(True, {'class':'bold'}) :
                 if subhead.contents:
                     bTag = Tag(soup, "b")
                     bTag.insert(0, subhead.contents[0])
                     subhead.replaceWith(bTag)
+        except:
+            self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
+        try:
+            #remove the <strong> update tag
+            blogupdated = soup.find('span', {'class':'update'})
+            if blogupdated:
+                blogupdated.replaceWith("")
+        except:
+            self.log("ERROR: Removing strong tag")
 
+        try:
             divTag = soup.find('div',attrs={'id':'articleBody'})
             if divTag:
                 divTag['class'] = divTag['id']
+        except:
+            self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")
 
+        try:
             # Add class="authorId" to <div> so we can format with CSS
             divTag = soup.find('div',attrs={'id':'authorId'})
             if divTag and divTag.contents[0]:
@@ -683,6 +757,31 @@ class NYTimes(BasicNewsRecipe):
                 tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
                          use_alt=False)))
                 divTag.replaceWith(tag)
+        except:
+            self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
 
         return soup
+    def populate_article_metadata(self, article, soup, first):
+        shortparagraph = ""
+        try:
+            if len(article.text_summary.strip()) == 0:
+                articlebodies = soup.findAll('div',attrs={'class':'articleBody'})
+                if articlebodies:
+                    for articlebody in articlebodies:
+                        if articlebody:
+                            paras = articlebody.findAll('p')
+                            for p in paras:
+                                refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
+                                #account for blank paragraphs and short paragraphs by appending them to longer ones
+                                if len(refparagraph) > 0:
+                                    if len(refparagraph) > 70: #approximately one line of text
+                                        article.summary = article.text_summary = shortparagraph + refparagraph
+                                        return
+                                    else:
+                                        shortparagraph = refparagraph + " "
+                                        if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
+                                            shortparagraph = shortparagraph + "- "
+        except:
+            self.log("Error creating article descriptions")
+            return

resources/recipes/pressthink.recipe (new file, 61 lines)
@@ -0,0 +1,61 @@
+__license__ = 'GPL v3'
+__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
+'''
+pressthink.org
+'''
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class PressThink(BasicNewsRecipe):
+    title = 'PressThink'
+    __author__ = 'Darko Miletic'
+    description = 'Ghost of democracy in the media machine'
+    oldest_article = 60
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    encoding = 'utf8'
+    publisher = 'Arthur L. Carter Journalism Institute'
+    category = 'news, USA, world, economy, politics, media'
+    language = 'en'
+    publication_type = 'blog'
+    extra_css = """
+        body{ font-family: Helvetica,Arial,sans-serif }
+        img{display: block; margin-bottom: 0.5em}
+        h6{font-size: 1.1em; font-weight: bold}
+        .post-author{font-family: Georgia,serif}
+        .post-title{color: #AB0000}
+        .says{color: gray}
+        .comment {
+            border-bottom: 1px dotted #555555;
+            border-top: 1px dotted #DDDDDD;
+            margin-left: 10px;
+            min-height: 100px;
+            padding: 15px 0 20px;
+        }
+    """
+
+    conversion_options = {
+        'comments'  : description
+        ,'tags'     : category
+        ,'language' : language
+        ,'publisher': publisher
+    }
+
+    remove_tags = [dict(name=['form','iframe','embed','object','link','base','table','meta'])]
+    keep_only_tags = [dict(attrs={'class':['post-title','post-author','entry','postmetadata alt','commentlist']})]
+
+    feeds = [(u'Articles', u'http://pressthink.org/feed/')]
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll('img', alt=False):
+            item['alt'] = 'image'
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
+        return soup

@@ -21,17 +21,54 @@ class SeattleTimes(BasicNewsRecipe):
     encoding = 'cp1252'
     language = 'en'
 
-    html2lrf_options = [
-            '--comment' , description
-            , '--category' , category
-            , '--publisher', publisher
+    feeds = [
+            (u'Top Stories',
+             u'http://seattletimes.nwsource.com/rss/home.xml'),
+            #(u'Articles', u'http://seattletimes.nwsource.com/rss/seattletimes.xml')
+            (u'Business & Technology',
+             u'http://seattletimes.nwsource.com/rss/businesstechnology.xml'),
+            (u'Personal Technology',
+             u'http://seattletimes.nwsource.com/rss/personaltechnology.xml'),
+            (u'Entertainment & the Arts',
+             u'http://seattletimes.nwsource.com/rss/artsentertainment.xml'),
+            (u'Health',
+             u'http://seattletimes.nwsource.com/rss/health.xml'),
+            (u'Living',
+             u'http://seattletimes.nwsource.com/rss/living.xml'),
+            (u'Local News',
+             u'http://seattletimes.nwsource.com/rss/localnews.xml'),
+            (u'Nation & World',
+             u'http://seattletimes.nwsource.com/rss/nationworld.xml'),
+            (u'Opinion',
+             u'http://seattletimes.nwsource.com/rss/opinion.xml'),
+            (u'Politics',
+             u'http://seattletimes.nwsource.com/rss/politics.xml'),
+            (u'Sports',
+             u'http://seattletimes.nwsource.com/rss/sports.xml'),
+            (u'Nicole Brodeur',
+             u'http://seattletimes.nwsource.com/rss/nicolebrodeur.xml'),
+            (u'Danny Westneat',
+             u'http://seattletimes.nwsource.com/rss/dannywestneat.xml'),
+            (u'Jerry Large',
+             u'http://seattletimes.nwsource.com/rss/jerrylarge.xml'),
+            (u'Ron Judd',
+             u'http://seattletimes.nwsource.com/rss/ronjudd.xml'),
+            (u'Education',
+             u'http://seattletimes.nwsource.com/rss/education.xml'),
+            (u'Letters to the Editor',
+             u'http://seattletimes.nwsource.com/rss/northwestvoices.xml'),
+            (u'Travel',
+             u'http://seattletimes.nwsource.com/rss/travel.xml'),
+            (u'Outdoors',
+             u'http://seattletimes.nwsource.com/rss/outdoors.xml'),
+            (u'Steve Kelley',
+             u'http://seattletimes.nwsource.com/rss/stevekelley.xml'),
+            (u'Jerry Brewer',
+             u'http://seattletimes.nwsource.com/rss/jerrybrewer.xml'),
+            (u'Most Read Articles',
+             u'http://seattletimes.nwsource.com/rss/mostreadarticles.xml'),
     ]
 
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
-
-    feeds = [(u'Articles', u'http://seattletimes.nwsource.com/rss/seattletimes.xml')]
-
     remove_tags = [
           dict(name=['object','link','script'])
          ,dict(name='p', attrs={'class':'permission'})

@@ -1,5 +1,5 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+#from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from urllib import quote
 
 class SportsIllustratedRecipe(BasicNewsRecipe) :
@@ -91,7 +91,7 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
         # expire : no idea what value to use
         # All this comes from the Javascript function that redirects to the print version. It's called PT() and is defined in the file 48.js
 
-    def preprocess_html(self, soup):
+    '''def preprocess_html(self, soup):
         header = soup.find('div', attrs = {'class' : 'siv_artheader'})
         homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
         body = homeMadeSoup.body
@@ -115,4 +115,5 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
             body.append(para)
 
         return homeMadeSoup
+    '''

@@ -35,7 +35,6 @@ class TechnologyReview(BasicNewsRecipe):
     def get_article_url(self, article):
         return article.get('guid', article.get('id', None))
 
-
     def print_version(self, url):
         baseurl='http://www.technologyreview.com/printer_friendly_article.aspx?id='
         split1 = string.split(url,"/")
@@ -43,3 +42,25 @@ class TechnologyReview(BasicNewsRecipe):
         split2= string.split(xxx,"/")
         s = baseurl + split2[0]
         return s
+
+
+    def postprocess_html(self,soup, True):
+        #remove picture
+        headerhtml = soup.find(True, {'class':'header'})
+        headerhtml.replaceWith("")
+
+        #remove close button
+        closehtml = soup.find(True, {'class':'close'})
+        closehtml.replaceWith("")
+
+        #remove banner advertisement
+        bannerhtml = soup.find(True, {'class':'bannerad'})
+        bannerhtml.replaceWith("")
+
+        #thanks kiklop74! This code removes all links from the text
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
+
+        return soup

resources/recipes/tri_city_herald.recipe (new file, 25 lines)
@@ -0,0 +1,25 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class TriCityHeraldRecipe(BasicNewsRecipe):
+    title = u'Tri-City Herald'
+    description = 'The Tri-City Herald Mid-Columbia.'
+    language = 'en'
+    __author__ = 'Laura Gjovaag'
+    oldest_article = 1.5
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    remove_javascript = True
+    keep_only_tags = [
+        dict(name='div', attrs={'id':'story_header'}),
+        dict(name='img', attrs={'class':'imageCycle'}),
+        dict(name='div', attrs={'id':['cycleImageCaption', 'story_body']})
+        ]
+    remove_tags = [
+        dict(name='div', attrs={'id':'story_mlt'}),
+        dict(name='a', attrs={'id':'commentCount'}),
+        dict(name=['script', 'noscript', 'style'])]
+    extra_css = 'h1{font: bold 140%;} #cycleImageCaption{font: monospace 60%}'
+
+    feeds = [
+        (u'Tri-City Herald Mid-Columbia', u'http://www.tri-cityherald.com/901/index.rss')
+        ]

resources/recipes/tyzden.recipe (new file, 80 lines)
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+__copyright__ = '2011, Miroslav Vasko zemiak@gmail.com'
+
+'''
+.tyzden, a weekly news magazine (a week old issue)
+'''
+from calibre import strftime
+from calibre.web.feeds.news import BasicNewsRecipe
+from datetime import date
+import re
+
+class TyzdenRecipe(BasicNewsRecipe):
+    __license__ = 'GPL v3'
+    __author__ = 'zemiak'
+    language = 'sk'
+    version = 1
+
+    publisher = u'www.tyzden.sk'
+    category = u'Magazine'
+    description = u'A conservative weekly magazine. The latest free issue'
+
+    today = date.today()
+    iso = today.isocalendar()
+    year = iso[0]
+    weeknum = iso[1]
+
+    if (weeknum > 1):
+        weeknum -= 1
+
+    title = u'tyzden'
+
+    base_url_path = 'http://www.tyzden.sk/casopis/' + str(year) + '/' + str(weeknum)
+    base_url = base_url_path + '.html'
+
+    oldest_article = 20
+    max_articles_per_feed = 100
+    remove_javascript = True
+
+    use_embedded_content = False
+    no_stylesheets = True
+
+    keep_only_tags = []
+    keep_only_tags.append(dict(name = 'h1'))
+    keep_only_tags.append(dict(name = 'div', attrs = {'class': 'text_area top_nofoto'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'class': 'text_block'}))
+
+    remove_tags_after = [dict(name = 'div', attrs = {'class': 'text_block'})]
+
+    def find_sections(self):
+        soup = self.index_to_soup(self.base_url)
+        # find cover pic
+        imgdiv = soup.find('div', attrs = {'class': 'foto'})
+        if imgdiv is not None:
+            img = imgdiv.find('img')
+            if img is not None:
+                self.cover_url = 'http://www.tyzden.sk/' + img['src']
+        # end find cover pic
+
+        for s in soup.findAll('a', attrs={'href': re.compile(r'rubrika/.*')}):
+            yield (self.tag_to_string(s), s)
+
+    def find_articles(self, soup):
+        for art in soup.findAllNext('a'):
+            if (not art['href'].startswith('casopis/')):
+                break;
+
+            url = art['href']
+            title = self.tag_to_string(art)
+            yield {
+                'title': title, 'url':self.base_url_path + '/' + url, 'description':title,
+                'date' : strftime('%a, %d %b'),
+                }
+
+    def parse_index(self):
+        feeds = []
+        for title, soup in self.find_sections():
+            feeds.append((title, list(self.find_articles(soup))))
+
+        return feeds

resources/recipes/wichita_eagle.recipe (new file, 29 lines)
@@ -0,0 +1,29 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1294938721(BasicNewsRecipe):
+    title = u'Wichita Eagle'
+    language = 'en'
+    __author__ = 'Jason Cameron'
+    description = 'Daily news from the Wichita Eagle'
+    oldest_article = 1
+    max_articles_per_feed = 30
+    keep_only_tags = [dict(name='div', attrs={'id':'wide'})]
+    feeds = [
+             (u'Local News',
+              u'http://www.kansas.com/news/local/index.rss'),
+             (u'National News',
+              u'http://www.kansas.com/news/nation-world/index.rss'),
+             (u'Sports',
+              u'http://www.kansas.com/sports/index.rss'),
+             (u'Opinion',
+              u'http://www.kansas.com/opinion/index.rss'),
+             (u'Life',
+              u'http://www.kansas.com/living/index.rss'),
+             (u'Entertainment',
+              u'http://www.kansas.com/entertainment/index.rss')
+            ]
+
+    def print_version(self, url):
+        urlparts = url.split('/')
+        newadd = urlparts[5]+'/v-print'
+        return url.replace(url, newadd.join(url.split(urlparts[5])))

@@ -2,8 +2,10 @@
 __license__ = 'GPL v3'
 __docformat__ = 'restructuredtext en'
 
+import re
+
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.chardet import xml_to_unicode
 
 class Wired_Daily(BasicNewsRecipe):
 
@@ -15,30 +17,43 @@ class Wired_Daily(BasicNewsRecipe):
 
     no_stylesheets = True
 
+    preprocess_regexps = [(re.compile(r'<head.*</head>', re.DOTALL), lambda m:
+        '<head></head>')]
+
     remove_tags_before = dict(name='div', id='content')
-    remove_tags = [dict(id=['social_tools', 'outerWrapper', 'sidebar',
-        'footer', 'advertisement', 'blog_subscription_unit',
-        'brightcove_component']),
-        {'class':'entryActions'},
-        dict(name=['noscript', 'script'])]
+    remove_tags = [dict(id=['header', 'commenting_module', 'post_nav',
+        'social_tools', 'sidebar', 'footer', 'social_wishlist', 'pgwidget',
+        'outerWrapper', 'inf_widget']),
+        {'class':['entryActions', 'advertisement', 'entryTags']},
+        dict(name=['noscript', 'script']),
+        dict(name='h4', attrs={'class':re.compile(r'rat\d+')}),
+        {'class':lambda x: x and x.startswith('contentjump')},
+        dict(name='li', attrs={'class':['entryCategories', 'entryEdit']})]
 
     feeds = [
         ('Top News', 'http://feeds.wired.com/wired/index'),
-        ('Culture', 'http://feeds.wired.com/wired/culture'),
-        ('Software', 'http://feeds.wired.com/wired/software'),
-        ('Mac', 'http://feeds.feedburner.com/cultofmac/bFow'),
-        ('Gadgets', 'http://feeds.wired.com/wired/gadgets'),
-        ('Cars', 'http://feeds.wired.com/wired/cars'),
-        ('Entertainment', 'http://feeds.wired.com/wired/entertainment'),
-        ('Gaming', 'http://feeds.wired.com/wired/gaming'),
-        ('Science', 'http://feeds.wired.com/wired/science'),
-        ('Med Tech', 'http://feeds.wired.com/wired/medtech'),
-        ('Politics', 'http://feeds.wired.com/wired/politics'),
-        ('Tech Biz', 'http://feeds.wired.com/wired/techbiz'),
-        ('Commentary', 'http://feeds.wired.com/wired/commentary'),
+        ('Product Reviews',
+         'http://www.wired.com/reviews/feeds/latestProductsRss'),
+        ('Autopia', 'http://www.wired.com/autopia/feed/'),
+        ('Danger Room', 'http://www.wired.com/dangerroom/feed/'),
+        ('Epicenter', 'http://www.wired.com/epicenter/feed/'),
+        ('Gadget Lab', 'http://www.wired.com/gadgetlab/feed/'),
+        ('Geek Dad', 'http://www.wired.com/geekdad/feed/'),
+        ('Playbook', 'http://www.wired.com/playbook/feed/'),
+        ('Rawfile', 'http://www.wired.com/rawfile/feed/'),
+        ('This Day in Tech', 'http://www.wired.com/thisdayintech/feed/'),
+        ('Threat Level', 'http://www.wired.com/threatlevel/feed/'),
+        ('Underwire', 'http://www.wired.com/underwire/feed/'),
+        ('Web Monkey', 'http://www.webmonkey.com/feed/'),
+        ('Science', 'http://www.wired.com/wiredscience/feed/'),
     ]
 
+    def populate_article_metadata(self, article, soup, first):
+        if article.text_summary:
+            article.text_summary = xml_to_unicode(article.text_summary,
+                    resolve_entities=True)[0]
+
     def print_version(self, url):
-        return url.replace('http://www.wired.com/', 'http://www.wired.com/print/')
+        return url + '/all/1'

resources/recipes/yakima_herald.recipe (new file, 21 lines)
@@ -0,0 +1,21 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class YakimaHeraldRepublicRecipe(BasicNewsRecipe):
+    title = u'Yakima Herald-Republic'
+    description = 'The Yakima Herald-Republic.'
+    language = 'en'
+    __author__ = 'Laura Gjovaag'
+    oldest_article = 1.5
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    remove_javascript = True
+    keep_only_tags = [
+        dict(name='div', attrs={'id':['searchleft', 'headline_credit']}),
+        dict(name='div', attrs={'class':['photo', 'cauthor', 'photocredit']}),
+        dict(name='div', attrs={'id':['content_body', 'footerleft']})
+        ]
+    extra_css = '.cauthor {font: monospace 60%;} .photocredit {font: monospace 60%}'
+
+    feeds = [
+        (u'Yakima Herald Online', u'http://feeds.feedburner.com/yhronlinenews'),
+        ]

resources/recipes/zerohedge.recipe (new file, 33 lines)
@@ -0,0 +1,33 @@
+__license__ = 'GPL v3'
+__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
+'''
+www.zerohedge.com
+'''
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class ZeroHedge(BasicNewsRecipe):
+    title = 'Zero Hedge'
+    __author__ = 'Darko Miletic'
+    description = 'On a long enough timeline the survival rate for everyone drops to zero'
+    oldest_article = 10
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = True
+    encoding = 'utf8'
+    publisher = 'zero hedge'
+    category = 'news, USA, world, economy, politics'
+    language = 'en'
+    masthead_url = 'http://www.zerohedge.com/themes/newsflash/logo.png'
+    publication_type = 'blog'
+    extra_css = 'body{ font-family: sans-serif }'
+
+    conversion_options = {
+        'comments'  : description
+        ,'tags'     : category
+        ,'language' : language
+        ,'publisher': publisher
+    }
+
+    feeds = [(u'Articles', u'http://feeds.feedburner.com/zerohedge/feed')]

resources/template-functions.json (new file, 28 lines)
@@ -0,0 +1,28 @@
+{
+    "contains": "def evaluate(self, formatter, kwargs, mi, locals,\n val, test, value_if_present, value_if_not):\n if re.search(test, val):\n return value_if_present\n else:\n return value_if_not\n",
+    "divide": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x / y)\n",
+    "uppercase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return val.upper()\n",
+    "strcat": "def evaluate(self, formatter, kwargs, mi, locals, *args):\n i = 0\n res = ''\n for i in range(0, len(args)):\n res += args[i]\n return res\n",
+    "substr": "def evaluate(self, formatter, kwargs, mi, locals, str_, start_, end_):\n return str_[int(start_): len(str_) if int(end_) == 0 else int(end_)]\n",
+    "ifempty": "def evaluate(self, formatter, kwargs, mi, locals, val, value_if_empty):\n if val:\n return val\n else:\n return value_if_empty\n",
+    "field": "def evaluate(self, formatter, kwargs, mi, locals, name):\n return formatter.get_value(name, [], kwargs)\n",
+    "capitalize": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return capitalize(val)\n",
+    "list_item": "def evaluate(self, formatter, kwargs, mi, locals, val, index, sep):\n if not val:\n return ''\n index = int(index)\n val = val.split(sep)\n try:\n return val[index]\n except:\n return ''\n",
+    "shorten": "def evaluate(self, formatter, kwargs, mi, locals,\n val, leading, center_string, trailing):\n l = max(0, int(leading))\n t = max(0, int(trailing))\n if len(val) > l + len(center_string) + t:\n return val[0:l] + center_string + ('' if t == 0 else val[-t:])\n else:\n return val\n",
+    "re": "def evaluate(self, formatter, kwargs, mi, locals, val, pattern, replacement):\n return re.sub(pattern, replacement, val)\n",
+    "add": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x + y)\n",
+    "lookup": "def evaluate(self, formatter, kwargs, mi, locals, val, *args):\n if len(args) == 2: # here for backwards compatibility\n if val:\n return formatter.vformat('{'+args[0].strip()+'}', [], kwargs)\n else:\n return formatter.vformat('{'+args[1].strip()+'}', [], kwargs)\n if (len(args) % 2) != 1:\n raise ValueError(_('lookup requires either 2 or an odd number of arguments'))\n i = 0\n while i < len(args):\n if i + 1 >= len(args):\n return formatter.vformat('{' + args[i].strip() + '}', [], kwargs)\n if re.search(args[i], val):\n return formatter.vformat('{'+args[i+1].strip() + '}', [], kwargs)\n i += 2\n",
+    "template": "def evaluate(self, formatter, kwargs, mi, locals, template):\n template = template.replace('[[', '{').replace(']]', '}')\n return formatter.safe_format(template, kwargs, 'TEMPLATE', mi)\n",
+    "print": "def evaluate(self, formatter, kwargs, mi, locals, *args):\n print args\n return None\n",
+    "titlecase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return titlecase(val)\n",
+    "test": "def evaluate(self, formatter, kwargs, mi, locals, val, value_if_set, value_not_set):\n if val:\n return value_if_set\n else:\n return value_not_set\n",
+    "eval": "def evaluate(self, formatter, kwargs, mi, locals, template):\n from formatter import eval_formatter\n template = template.replace('[[', '{').replace(']]', '}')\n return eval_formatter.safe_format(template, locals, 'EVAL', None)\n",
+    "multiply": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x * y)\n",
+    "subtract": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x - y)\n",
+    "count": "def evaluate(self, formatter, kwargs, mi, locals, val, sep):\n return unicode(len(val.split(sep)))\n",
+    "lowercase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return val.lower()\n",
+    "assign": "def evaluate(self, formatter, kwargs, mi, locals, target, value):\n locals[target] = value\n return value\n",
+    "switch": "def evaluate(self, formatter, kwargs, mi, locals, val, *args):\n if (len(args) % 2) != 1:\n raise ValueError(_('switch requires an odd number of arguments'))\n i = 0\n while i < len(args):\n if i + 1 >= len(args):\n return args[i]\n if re.search(args[i], val):\n return args[i+1]\n i += 2\n",
+    "strcmp": "def evaluate(self, formatter, kwargs, mi, locals, x, y, lt, eq, gt):\n v = strcmp(x, y)\n if v < 0:\n return lt\n if v == 0:\n return eq\n return gt\n",
+    "cmp": "def evaluate(self, formatter, kwargs, mi, locals, x, y, lt, eq, gt):\n x = float(x if x else 0)\n y = float(y if y else 0)\n if x < y:\n return lt\n if x == y:\n return eq\n return gt\n"
+}

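For context, a minimal sketch (not part of the commit) of how this resource file can be read back; the relative path is an assumption about where a checkout keeps it:

    # Hedged example: inspect the dumped template-function sources.
    import json

    with open('resources/template-functions.json', 'rb') as f:
        functions = json.load(f)

    print sorted(functions.keys())   # e.g. ['add', 'assign', 'capitalize', ...]
    print functions['uppercase']     # the source of that function's evaluate()
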
@@ -287,7 +287,7 @@
     <xsl:value-of select="count(preceding::rtf:footnote) + 1"/>
     <xsl:text>]</xsl:text>
     </xsl:when>
-    <xsl:when test="(@superscript = 'true')">
+    <xsl:when test="(@superscript)">
     <xsl:element name="sup">
     <xsl:element name="span">
     <xsl:attribute name="class">
@@ -297,7 +297,7 @@
     </xsl:element>
     </xsl:element>
     </xsl:when>
-    <xsl:when test="(@underscript = 'true')">
+    <xsl:when test="(@underscript or @subscript)">
     <xsl:element name="sub">
     <xsl:element name="span">
     <xsl:attribute name="class">

@@ -117,7 +117,6 @@ if iswindows:
     poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
             r'%s\poppler;%s'%(sw_inc_dir, sw_inc_dir))
 
-    popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[1]+r'\qt4']
     poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', sw_lib_dir)
     popplerqt4_lib_dirs = poppler_lib_dirs
     poppler_libs = ['poppler']
@@ -131,7 +130,6 @@ elif isosx:
     fc_lib = '/sw/lib'
     poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
             '/sw/build/poppler-0.14.5/poppler:/sw/build/poppler-0.14.5')
-    popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
     poppler_lib_dirs = consolidate('POPPLER_LIB_DIR',
             '/sw/lib')
     poppler_libs = ['poppler']
@@ -150,9 +148,6 @@ else:
     # Include directories
     poppler_inc_dirs = pkgconfig_include_dirs('poppler',
             'POPPLER_INC_DIR', '/usr/include/poppler')
-    popplerqt4_inc_dirs = pkgconfig_include_dirs('poppler-qt4', '', '')
-    if not popplerqt4_inc_dirs:
-        popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
     png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR',
             '/usr/include')
     magick_inc_dirs = pkgconfig_include_dirs('MagickWand', 'MAGICK_INC', '/usr/include/ImageMagick')
@@ -187,20 +182,17 @@ if not poppler_inc_dirs or not os.path.exists(
     poppler_error = \
         ('Poppler not found on your system. Various PDF related',
         ' functionality will not work. Use the POPPLER_INC_DIR and',
-        ' POPPLER_LIB_DIR environment variables.')
-
-popplerqt4_error = None
-if not popplerqt4_inc_dirs or not os.path.exists(
-        os.path.join(popplerqt4_inc_dirs[-1], 'poppler-qt4.h')):
-    popplerqt4_error = \
-        ('Poppler Qt4 bindings not found on your system.')
-
+        ' POPPLER_LIB_DIR environment variables. calibre requires '
+        ' the poppler XPDF headers. If your distro does not '
+        ' include them you will have to re-compile poppler '
+        ' by hand with --enable-xpdf-headers')
 
 magick_error = None
 if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0],
     'wand')):
     magick_error = ('ImageMagick not found on your system. '
             'Try setting the environment variables MAGICK_INC '
-            'and MAGICK_LIB to help calibre locate the inclue and libbrary '
+            'and MAGICK_LIB to help calibre locate the include and library '
             'files.')
 
 podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib)

@@ -43,8 +43,9 @@ class Stage3(Command):
 
     description = 'Stage 3 of the publish process'
     sub_commands = ['upload_user_manual', 'upload_demo', 'sdist',
-            'upload_to_google_code', 'tag_release', 'upload_to_server',
-            'upload_to_sourceforge', 'upload_to_mobileread',
+            'upload_to_google_code', 'upload_to_sourceforge',
+            'tag_release', 'upload_to_server',
+            'upload_to_mobileread',
             ]
 
 class Stage4(Command):

@@ -84,6 +84,23 @@ class Resources(Command):
 
         cPickle.dump(complete, open(dest, 'wb'), -1)
 
+        self.info('\tCreating template-functions.json')
+        dest = self.j(self.RESOURCES, 'template-functions.json')
+        function_dict = {}
+        import inspect
+        from calibre.utils.formatter_functions import all_builtin_functions
+        for obj in all_builtin_functions:
+            eval_func = inspect.getmembers(obj,
+                    lambda x: inspect.ismethod(x) and x.__name__ == 'evaluate')
+            try:
+                lines = [l[4:] for l in inspect.getsourcelines(eval_func[0][1])[0]]
+            except:
+                continue
+            lines = ''.join(lines)
+            function_dict[obj.name] = lines
+        import json
+        json.dump(function_dict, open(dest, 'wb'), indent=4)
+
     def clean(self):
         for x in ('scripts', 'recipes', 'ebook-convert-complete'):
             x = self.j(self.RESOURCES, x+'.pickle')

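A rough sketch of the round trip, under the assumption that a consumer recompiles these dumped sources back into callables (Python 2 exec semantics, matching the code above; not something this commit itself does):

    # Hedged sketch: turn one dumped evaluate() source into a callable.
    import json

    functions = json.load(open('resources/template-functions.json', 'rb'))
    namespace = {}
    exec functions['uppercase'] in namespace   # defines evaluate() in namespace
    evaluate = namespace['evaluate']
    # signature: evaluate(self, formatter, kwargs, mi, locals, val)
    print evaluate(None, None, {}, None, {}, 'hello')   # -> HELLO
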
@@ -6,7 +6,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import os, re, cStringIO, base64, httplib, subprocess, hashlib, shutil
+import os, re, cStringIO, base64, httplib, subprocess, hashlib, shutil, time
 from subprocess import check_call
 from tempfile import NamedTemporaryFile, mkdtemp
 
@@ -160,7 +160,7 @@ class UploadToGoogleCode(Command):
 
         return 'multipart/form-data; boundary=%s' % BOUNDARY, CRLF.join(body)
 
-    def upload(self, fname, desc, labels=[]):
+    def upload(self, fname, desc, labels=[], retry=0):
         form_fields = [('summary', desc)]
         form_fields.extend([('label', l.strip()) for l in labels])
 
@@ -183,6 +183,10 @@ class UploadToGoogleCode(Command):
 
             print 'Failed to upload with code %d and reason: %s'%(resp.status,
                     resp.reason)
+            if retry < 1:
+                print 'Retrying in 5 seconds....'
+                time.sleep(5)
+                return self.upload(fname, desc, labels=labels, retry=retry+1)
             raise Exception('Failed to upload '+fname)
 

@@ -241,7 +241,7 @@ def get_parsed_proxy(typ='http', debug=True):
     return ans
 
 
-def browser(honor_time=True, max_time=2, mobile_browser=False):
+def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
     '''
     Create a mechanize browser for web scraping. The browser handles cookies,
     refresh requests and ignores robots.txt. Also uses proxy if avaialable.
@@ -253,8 +253,10 @@ def browser(honor_time=True, max_time=2, mobile_browser=False):
     opener = Browser()
     opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
     opener.set_handle_robots(False)
-    opener.addheaders = [('User-agent', ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
-                'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13')]
+    if user_agent is None:
+        user_agent = ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
+                'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13'
+    opener.addheaders = [('User-agent', user_agent)]
     http_proxy = get_proxies().get('http', None)
     if http_proxy:
         opener.set_proxies({'http':http_proxy})

|
|||||||
obj = obj.decode('utf-8')
|
obj = obj.decode('utf-8')
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
|
def as_unicode(obj, enc=preferred_encoding):
|
||||||
|
if not isbytestring(obj):
|
||||||
|
try:
|
||||||
|
obj = unicode(obj)
|
||||||
|
except:
|
||||||
|
try:
|
||||||
|
obj = str(obj)
|
||||||
|
except:
|
||||||
|
obj = repr(obj)
|
||||||
|
return force_unicode(obj, enc=enc)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def human_readable(size):
|
def human_readable(size):
|
||||||
""" Convert a size in bytes into a human readable form """
|
""" Convert a size in bytes into a human readable form """
|
||||||
|
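A small sketch of the intended use, e.g. safely stringifying an arbitrary exception object (Python 2 semantics, matching the surrounding code):

    from calibre import as_unicode

    try:
        raise ValueError('boom')
    except Exception, e:
        msg = as_unicode(e)   # always returns unicode, never raises
        print msg
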
@@ -2,7 +2,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 __appname__   = 'calibre'
-__version__   = '0.7.37'
+__version__   = '0.7.42'
 __author__    = "Kovid Goyal <kovid@kovidgoyal.net>"
 
 import re

@@ -705,13 +705,17 @@ class ActionTweakEpub(InterfaceActionBase):
     name = 'Tweak ePub'
     actual_plugin = 'calibre.gui2.actions.tweak_epub:TweakEpubAction'
 
+class ActionNextMatch(InterfaceActionBase):
+    name = 'Next Match'
+    actual_plugin = 'calibre.gui2.actions.next_match:NextMatchAction'
+
 plugins += [ActionAdd, ActionFetchAnnotations, ActionGenerateCatalog,
         ActionConvert, ActionDelete, ActionEditMetadata, ActionView,
         ActionFetchNews, ActionSaveToDisk, ActionShowBookDetails,
         ActionRestart, ActionOpenFolder, ActionConnectShare,
         ActionSendToDevice, ActionHelp, ActionPreferences, ActionSimilarBooks,
         ActionAddToLibrary, ActionEditCollections, ActionChooseLibrary,
-        ActionCopyToLibrary, ActionTweakEpub]
+        ActionCopyToLibrary, ActionTweakEpub, ActionNextMatch]
 
 # }}}
 
@@ -843,6 +847,17 @@ class Plugboard(PreferencesPlugin):
     config_widget = 'calibre.gui2.preferences.plugboard'
     description = _('Change metadata fields before saving/sending')
 
+class TemplateFunctions(PreferencesPlugin):
+    name = 'TemplateFunctions'
+    icon = I('template_funcs.png')
+    gui_name = _('Template Functions')
+    category = 'Advanced'
+    gui_category = _('Advanced')
+    category_order = 5
+    name_order = 4
+    config_widget = 'calibre.gui2.preferences.template_functions'
+    description = _('Create your own template functions')
+
 class Email(PreferencesPlugin):
     name = 'Email'
     icon = I('mail.png')
@@ -904,6 +919,6 @@ class Misc(PreferencesPlugin):
 
 plugins += [LookAndFeel, Behavior, Columns, Toolbar, InputOptions,
     CommonOptions, OutputOptions, Adding, Saving, Sending, Plugboard,
-    Email, Server, Plugins, Tweaks, Misc]
+    Email, Server, Plugins, Tweaks, Misc, TemplateFunctions]
 
 #}}}
 

@@ -160,18 +160,6 @@ class InputFormatPlugin(Plugin):
         '''
         raise NotImplementedError()
 
-    def preprocess_html(self, opts, html):
-        '''
-        This method is called by the conversion pipeline on all HTML before it
-        is parsed. It is meant to be used to do any required preprocessing on
-        the HTML, like removing hard line breaks, etc.
-
-        :param html: A unicode string
-        :return: A unicode string
-        '''
-        return html
-
-
     def convert(self, stream, options, file_ext, log, accelerators):
         '''
         This method must be implemented in sub-classes. It must return

@@ -441,7 +441,7 @@ class TabletOutput(iPadOutput):
 
 class SamsungGalaxy(TabletOutput):
     name = 'Samsung Galaxy'
-    shortname = 'galaxy'
+    short_name = 'galaxy'
     description = _('Intended for the Samsung Galaxy and similar tablet devices with '
             'a resolution of 600x1280')
     screen_size = comic_screen_size = (600, 1280)

@@ -21,21 +21,22 @@ class ANDROID(USBMS):
             # HTC
             0x0bb4 : { 0x0c02 : [0x100, 0x0227, 0x0226], 0x0c01 : [0x100, 0x0227], 0x0ff9
                     : [0x0100, 0x0227, 0x0226], 0x0c87: [0x0100, 0x0227, 0x0226],
-                    0xc92 : [0x100], 0xc97: [0x226]},
+                    0xc92 : [0x100], 0xc97: [0x226], 0xc99 : [0x0100]},
 
             # Eken
             0x040d : { 0x8510 : [0x0001], 0x0851 : [0x1] },
 
             # Motorola
-            0x22b8 : { 0x41d9 : [0x216], 0x2d61: [0x100], 0x2d67 : [0x100],
-                    0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216] },
+            0x22b8 : { 0x41d9 : [0x216], 0x2d61 : [0x100], 0x2d67 : [0x100],
+                    0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216],
+                    0x4286 : [0x216], 0x42b3 : [0x216] },
 
             # Sony Ericsson
             0xfce : { 0xd12e : [0x0100]},
 
             # Google
             0x18d1 : { 0x4e11 : [0x0100, 0x226, 0x227], 0x4e12: [0x0100, 0x226,
-                    0x227], 0x4e21: [0x0100, 0x226, 0x227]},
+                    0x227], 0x4e21: [0x0100, 0x226, 0x227], 0xb058: [0x0222]},
 
             # Samsung
             0x04e8 : { 0x681d : [0x0222, 0x0223, 0x0224, 0x0400],
@@ -52,6 +53,9 @@ class ANDROID(USBMS):
             # LG
             0x1004 : { 0x61cc : [0x100] },
 
+            # Archos
+            0x0e79 : { 0x1419: [0x0216], 0x1420 : [0x0216]},
+
             }
     EBOOK_DIR_MAIN = ['eBooks/import', 'wordplayer/calibretransfer', 'Books']
     EXTRA_CUSTOMIZATION_MESSAGE = _('Comma separated list of directories to '
@@ -60,18 +64,20 @@ class ANDROID(USBMS):
     EXTRA_CUSTOMIZATION_DEFAULT = ', '.join(EBOOK_DIR_MAIN)
 
     VENDOR_NAME      = ['HTC', 'MOTOROLA', 'GOOGLE_', 'ANDROID', 'ACER',
-            'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE']
+            'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE', 'ARCHOS',
+            'TELECHIP']
     WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE',
             '__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897',
             'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID',
             'SCH-I500_CARD', 'SPH-D700_CARD', 'MB810', 'GT-P1000', 'DESIRE',
-            'SGH-T849', '_MB300']
+            'SGH-T849', '_MB300', 'A70S', 'S_ANDROID', 'A101IT']
     WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
-            'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD']
+            'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
+            'A70S', 'A101IT']
 
-    OSX_MAIN_MEM = 'HTC Android Phone Media'
+    OSX_MAIN_MEM = 'Android Device Main Memory'
 
-    MAIN_MEMORY_VOLUME_LABEL  = 'Android Phone Internal Memory'
+    MAIN_MEMORY_VOLUME_LABEL  = 'Android Device Main Memory'
 
     SUPPORTS_SUB_DIRS = True

@@ -178,7 +178,7 @@ class INVESBOOK(EB600):
 
 class BOOQ(EB600):
     name = 'Booq Device Interface'
-    gui_name = 'Booq'
+    gui_name = 'bq Reader'
 
     FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'pdf', 'doc', 'rtf', 'txt', 'html']
 

@@ -27,7 +27,7 @@ class Book(Book_):
 
         self.size = size # will be set later if None
 
-        if ContentType == '6':
+        if ContentType == '6' and date is not None:
             self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
         else:
             try:

@@ -33,8 +33,8 @@ class PALMPRE(USBMS):
 
 class AVANT(USBMS):
     name = 'Booq Avant Device Interface'
-    gui_name = 'Avant'
-    description = _('Communicate with the Booq Avant')
+    gui_name = 'bq Avant'
+    description = _('Communicate with the Bq Avant')
     author = 'Kovid Goyal'
     supported_platforms = ['windows', 'osx', 'linux']
 

|
|||||||
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = '__UMS_COMPOSITE'
|
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = '__UMS_COMPOSITE'
|
||||||
THUMBNAIL_HEIGHT = 130
|
THUMBNAIL_HEIGHT = 130
|
||||||
|
|
||||||
EBOOK_DIR_MAIN = 'eBooks'
|
EBOOK_DIR_MAIN = EBOOK_DIR_CARD_A = 'eBooks'
|
||||||
SUPPORTS_SUB_DIRS = False
|
SUPPORTS_SUB_DIRS = False
|
||||||
DELETE_EXTS = ['.jpg', '.jpeg', '.png']
|
DELETE_EXTS = ['.jpg', '.jpeg', '.png']
|
||||||
|
|
||||||
@ -193,6 +193,9 @@ class LUMIREAD(USBMS):
|
|||||||
|
|
||||||
THUMBNAIL_HEIGHT = 200
|
THUMBNAIL_HEIGHT = 200
|
||||||
|
|
||||||
|
VENDOR_NAME = 'ACER'
|
||||||
|
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'LUMIREAD_600'
|
||||||
|
|
||||||
def upload_cover(self, path, filename, metadata, filepath):
|
def upload_cover(self, path, filename, metadata, filepath):
|
||||||
if metadata.thumbnail and metadata.thumbnail[-1]:
|
if metadata.thumbnail and metadata.thumbnail[-1]:
|
||||||
cfilepath = filepath.replace('/', os.sep)
|
cfilepath = filepath.replace('/', os.sep)
|
||||||
|
@@ -91,3 +91,19 @@ class NOOK_COLOR(NOOK):

     EBOOK_DIR_MAIN = 'My Files/Books'

+    '''
+    def create_upload_path(self, path, mdata, fname, create_dirs=True):
+        filepath = NOOK.create_upload_path(self, path, mdata, fname,
+                create_dirs=create_dirs)
+        edm = self.EBOOK_DIR_MAIN.replace('/', os.sep)
+        npath = os.path.join(edm, _('News')) + os.sep
+        if npath in filepath:
+            filepath = filepath.replace(npath, os.sep.join('My Files',
+                'Magazines')+os.sep)
+            filedir = os.path.dirname(filepath)
+            if create_dirs and not os.path.exists(filedir):
+                os.makedirs(filedir)
+
+        return filepath
+    '''
@@ -76,17 +76,31 @@ class PRS505(USBMS):
             'sending DRMed books in which you cannot change the cover.'
             ' WARNING: This option should only be used with newer '
             'SONY readers: 350, 650, 950 and newer.'),
+        _('Refresh separate covers when using automatic management (newer readers)') +
+        ':::' +
+        _('Set this option to have separate book covers uploaded '
+          'every time you connect your device. Unset this option if '
+          'you have so many books on the reader that performance is '
+          'unacceptable.')
     ]
     EXTRA_CUSTOMIZATION_DEFAULT = [
         ', '.join(['series', 'tags']),
+        False,
         False
     ]

+    OPT_COLLECTIONS    = 0
+    OPT_UPLOAD_COVERS  = 1
+    OPT_REFRESH_COVERS = 2
+
     plugboard = None
     plugboard_func = None

     THUMBNAIL_HEIGHT = 200

+    MAX_PATH_LEN = 201 # 250 - (max(len(CACHE_THUMBNAIL), len(MEDIA_THUMBNAIL)) +
+                       #        len('main_thumbnail.jpg') + 1)

     def windows_filter_pnp_id(self, pnp_id):
         return '_LAUNCHER' in pnp_id

@@ -171,7 +185,7 @@ class PRS505(USBMS):
         opts = self.settings()
         if opts.extra_customization:
             collections = [x.strip() for x in
-                    opts.extra_customization[0].split(',')]
+                    opts.extra_customization[self.OPT_COLLECTIONS].split(',')]
         else:
             collections = []
         debug_print('PRS505: collection fields:', collections)
@@ -183,6 +197,23 @@ class PRS505(USBMS):
         c.update(blists, collections, pb)
         c.write()

+        if opts.extra_customization[self.OPT_REFRESH_COVERS]:
+            debug_print('PRS505: uploading covers in sync_booklists')
+            for idx,bl in blists.items():
+                prefix = self._card_a_prefix if idx == 1 else \
+                                 self._card_b_prefix if idx == 2 \
+                                 else self._main_prefix
+                for book in bl:
+                    try:
+                        p = os.path.join(prefix, book.lpath)
+                        self._upload_cover(os.path.dirname(p),
+                                os.path.splitext(os.path.basename(p))[0],
+                                book, p)
+                    except:
+                        debug_print('FAILED to upload cover', p)
+        else:
+            debug_print('PRS505: NOT uploading covers in sync_booklists')
+
         USBMS.sync_booklists(self, booklists, end_session=end_session)
         debug_print('PRS505: finished sync_booklists')

@@ -199,11 +230,17 @@ class PRS505(USBMS):

     def upload_cover(self, path, filename, metadata, filepath):
         opts = self.settings()
-        if not opts.extra_customization[1]:
+        if not opts.extra_customization[self.OPT_UPLOAD_COVERS]:
             # Building thumbnails disabled
-            debug_print('PRS505: not uploading covers')
+            debug_print('PRS505: not uploading cover')
             return
-        debug_print('PRS505: uploading covers')
+        debug_print('PRS505: uploading cover')
+        try:
+            self._upload_cover(path, filename, metadata, filepath)
+        except:
+            debug_print('FAILED to upload cover', filepath)
+
+    def _upload_cover(self, path, filename, metadata, filepath):
         if metadata.thumbnail and metadata.thumbnail[-1]:
             path = path.replace('/', os.sep)
             is_main = path.startswith(self._main_prefix)
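The OPT_* constants introduced above replace magic indexes into the EXTRA_CUSTOMIZATION list, so call sites stay correct when new options are appended. A minimal sketch of the pattern (the values are illustrative):

    # Sketch: name the slots of a positional settings list instead of
    # indexing it with bare numbers.
    OPT_COLLECTIONS    = 0
    OPT_UPLOAD_COVERS  = 1
    OPT_REFRESH_COVERS = 2

    extra_customization = ['series, tags', False, False]

    if extra_customization[OPT_REFRESH_COVERS]:
        print('would re-upload covers on every connect')
    else:
        print('covers left alone')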
@@ -98,6 +98,9 @@ class Device(DeviceConfig, DevicePlugin):
     # copy these back to the library
     BACKLOADING_ERROR_MESSAGE = None

+    #: The maximum length of paths created on the device
+    MAX_PATH_LEN = 250
+
     def reset(self, key='-1', log_packets=False, report_progress=None,
             detected_device=None):
         self._main_prefix = self._card_a_prefix = self._card_b_prefix = None
@@ -875,7 +878,7 @@ class Device(DeviceConfig, DevicePlugin):

     def create_upload_path(self, path, mdata, fname, create_dirs=True):
         path = os.path.abspath(path)
-        extra_components = []
+        maxlen = self.MAX_PATH_LEN

         special_tag = None
         if mdata.tags:
@@ -902,7 +905,7 @@ class Device(DeviceConfig, DevicePlugin):
             app_id = str(getattr(mdata, 'application_id', ''))
             # The db id will be in the created filename
             extra_components = get_components(template, mdata, fname,
-                    timefmt=opts.send_timefmt, length=250-len(app_id)-1)
+                    timefmt=opts.send_timefmt, length=maxlen-len(app_id)-1)
         if not extra_components:
             extra_components.append(sanitize(self.filename_callback(fname,
                 mdata)))
@@ -937,12 +940,11 @@ class Device(DeviceConfig, DevicePlugin):
             return ans

         extra_components = list(map(remove_trailing_periods, extra_components))
-        components = shorten_components_to(250 - len(path), extra_components)
+        components = shorten_components_to(maxlen - len(path), extra_components)
         components = self.sanitize_path_components(components)
         filepath = os.path.join(path, *components)
         filedir = os.path.dirname(filepath)
-
         if create_dirs and not os.path.exists(filedir):
             os.makedirs(filedir)
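With this change, create_upload_path budgets path components against the per-device MAX_PATH_LEN (201 for the PRS505, 250 by default) instead of a hard-coded 250. A rough sketch of that kind of budgeting; this is not calibre's shorten_components_to, just an illustration of the idea:

    import os

    MAX_PATH_LEN = 250  # per-device ceiling, overridable by subclasses

    def shorten_components(components, budget):
        # Trim each path component so the joined path fits in the budget.
        # Distributes the allowance evenly; a real implementation would
        # also preserve file extensions and separator overhead.
        per_part = max(1, budget // max(1, len(components)))
        return [c[:per_part] for c in components]

    parts = shorten_components(['Some Author', 'A Very Long Book Title (1234)'],
                               MAX_PATH_LEN - len('/media/reader'))
    print(os.path.join('/media/reader', *parts))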
@@ -18,7 +18,7 @@

 __version__ = "1.0"

-import re
+import re, codecs

 def detect(aBuf):
     import calibre.ebooks.chardet.universaldetector as universaldetector
@@ -83,9 +83,11 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
     if not raw:
         return u'', encoding
     if not isinstance(raw, unicode):
-        if raw.startswith('\xff\xfe'):
+        if raw.startswith(codecs.BOM_UTF8):
+            raw, encoding = raw.decode('utf-8')[1:], 'utf-8'
+        elif raw.startswith(codecs.BOM_UTF16_LE):
             raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
-        elif raw.startswith('\xfe\xff'):
+        elif raw.startswith(codecs.BOM_UTF16_BE):
             raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be'
         if not isinstance(raw, unicode):
             for pat in ENCODING_PATS:
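The hunk above switches BOM detection from hand-written byte strings to the named constants in the codecs module, and adds a UTF-8 BOM case. A standalone sketch of the same idea (the function name is illustrative):

    import codecs

    def sniff_bom(data):
        # Return (text, encoding) when the byte string starts with a known
        # byte-order mark; the slice [1:] drops the decoded U+FEFF character.
        if data.startswith(codecs.BOM_UTF8):
            return data.decode('utf-8')[1:], 'utf-8'
        if data.startswith(codecs.BOM_UTF16_LE):
            return data.decode('utf-16-le')[1:], 'utf-16-le'
        if data.startswith(codecs.BOM_UTF16_BE):
            return data.decode('utf-16-be')[1:], 'utf-16-be'
        return None, None

    raw = codecs.BOM_UTF16_LE + u'hello'.encode('utf-16-le')
    print(sniff_bom(raw))  # (u'hello', 'utf-16-le')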
@@ -75,7 +75,7 @@ class CHMInput(InputFormatPlugin):
     def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
         from calibre.ebooks.conversion.plumber import create_oebbook
         from calibre.ebooks.oeb.base import DirContainer
-        oeb = create_oebbook(log, None, opts, self,
+        oeb = create_oebbook(log, None, opts,
                 encoding=opts.input_encoding, populate=False)
         self.oeb = oeb

@@ -42,6 +42,12 @@ option.
 For full documentation of the conversion system see
 ''') + 'http://calibre-ebook.com/user_manual/conversion.html'

+HEURISTIC_OPTIONS = ['markup_chapter_headings',
+                     'italicize_common_cases', 'fix_indents',
+                     'html_unwrap_factor', 'unwrap_lines',
+                     'delete_blank_paragraphs', 'format_scene_breaks',
+                     'dehyphenate', 'renumber_headings']
+
 def print_help(parser, log):
     help = parser.format_help().encode(preferred_encoding, 'replace')
     log(help)
@@ -83,6 +89,8 @@ def option_recommendation_to_cli_option(add_option, rec):
     if opt.long_switch == 'verbose':
         attrs['action'] = 'count'
         attrs.pop('type', '')
+    if opt.name in HEURISTIC_OPTIONS and rec.recommended_value is True:
+        switches = ['--disable-'+opt.long_switch]
     add_option(Option(*switches, **attrs))

 def add_input_output_options(parser, plumber):
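Because every heuristic in HEURISTIC_OPTIONS defaults to on once heuristics are enabled, the command line exposes them as --disable-* switches rather than --enable-* ones. A small sketch of deriving such switches with Python's optparse (the option names below are illustrative):

    from optparse import OptionParser

    HEURISTIC_OPTIONS = ['unwrap_lines', 'dehyphenate', 'fix_indents']

    parser = OptionParser()
    parser.add_option('--enable-heuristics', action='store_true', default=False)
    for name in HEURISTIC_OPTIONS:
        # Each heuristic defaults to True, so the user-facing switch turns it off.
        switch = '--disable-' + name.replace('_', '-')
        parser.add_option(switch, dest=name, action='store_false', default=True)

    opts, args = parser.parse_args(['--enable-heuristics', '--disable-dehyphenate'])
    print(opts.dehyphenate)  # False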
@@ -126,8 +134,24 @@ def add_pipeline_options(parser, plumber):
                   'margin_top', 'margin_left', 'margin_right',
                   'margin_bottom', 'change_justification',
                   'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size',
-                  'asciiize', 'remove_header', 'header_regex',
-                  'remove_footer', 'footer_regex',
+                  'asciiize',
+                  ]
+              ),
+
+              'HEURISTIC PROCESSING' : (
+                  _('Modify the document text and structure using common'
+                    ' patterns. Disabled by default. Use %s to enable. '
+                    ' Individual actions can be disabled with the %s options.')
+                  % ('--enable-heuristics', '--disable-*'),
+                  ['enable_heuristics'] + HEURISTIC_OPTIONS
+              ),
+
+              'SEARCH AND REPLACE' : (
+                  _('Modify the document text and structure using user defined patterns.'),
+                  [
+                      'sr1_search', 'sr1_replace',
+                      'sr2_search', 'sr2_replace',
+                      'sr3_search', 'sr3_replace',
                   ]
               ),

@@ -137,7 +161,6 @@ def add_pipeline_options(parser, plumber):
                   'chapter', 'chapter_mark',
                   'prefer_metadata_cover', 'remove_first_image',
                   'insert_metadata', 'page_breaks_before',
-                  'preprocess_html', 'html_unwrap_factor',
                   ]
               ),

@@ -164,7 +187,8 @@ def add_pipeline_options(parser, plumber):

     }

-    group_order = ['', 'LOOK AND FEEL', 'STRUCTURE DETECTION',
+    group_order = ['', 'LOOK AND FEEL', 'HEURISTIC PROCESSING',
+                   'SEARCH AND REPLACE', 'STRUCTURE DETECTION',
                    'TABLE OF CONTENTS', 'METADATA', 'DEBUG']

     for group in group_order:
@@ -72,7 +72,8 @@ class Plumber(object):
     ]

     def __init__(self, input, output, log, report_progress=DummyReporter(),
-            dummy=False, merge_plugin_recs=True, abort_after_input_dump=False):
+            dummy=False, merge_plugin_recs=True, abort_after_input_dump=False,
+            override_input_metadata=False):
         '''
         :param input: Path to input file.
         :param output: Path to output file/directory
@@ -87,7 +88,9 @@ class Plumber(object):
         self.log = log
         self.ui_reporter = report_progress
         self.abort_after_input_dump = abort_after_input_dump
+        self.override_input_metadata = override_input_metadata

+        # Pipeline options {{{
         # Initialize the conversion options that are independent of input and
         # output formats. The input and output plugins can still disable these
         # options via recommendations.
@@ -375,23 +378,6 @@ OptionRecommendation(name='insert_metadata',
         )
     ),

-OptionRecommendation(name='preprocess_html',
-        recommended_value=False, level=OptionRecommendation.LOW,
-        help=_('Attempt to detect and correct hard line breaks and other '
-            'problems in the source file. This may make things worse, so use '
-            'with care.'
-            )
-        ),
-
-OptionRecommendation(name='html_unwrap_factor',
-        recommended_value=0.40, level=OptionRecommendation.LOW,
-        help=_('Scale used to determine the length at which a line should '
-            'be unwrapped if preprocess is enabled. Valid values are a decimal between 0 and 1. The '
-            'default is 0.40, just below the median line length. This will unwrap typical books '
-            ' with hard line breaks, but should be reduced if the line length is variable.'
-            )
-        ),
-
 OptionRecommendation(name='smarten_punctuation',
         recommended_value=False, level=OptionRecommendation.LOW,
         help=_('Convert plain quotes, dashes and ellipsis to their '
@@ -400,32 +386,6 @@ OptionRecommendation(name='smarten_punctuation',
         )
     ),

-OptionRecommendation(name='remove_header',
-        recommended_value=False, level=OptionRecommendation.LOW,
-        help=_('Use a regular expression to try and remove the header.'
-            )
-        ),
-
-OptionRecommendation(name='header_regex',
-        recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
-        level=OptionRecommendation.LOW,
-        help=_('The regular expression to use to remove the header.'
-            )
-        ),
-
-OptionRecommendation(name='remove_footer',
-        recommended_value=False, level=OptionRecommendation.LOW,
-        help=_('Use a regular expression to try and remove the footer.'
-            )
-        ),
-
-OptionRecommendation(name='footer_regex',
-        recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
-        level=OptionRecommendation.LOW,
-        help=_('The regular expression to use to remove the footer.'
-            )
-        ),
-
 OptionRecommendation(name='read_metadata_from_opf',
         recommended_value=None, level=OptionRecommendation.LOW,
         short_switch='m',
@@ -526,7 +486,91 @@ OptionRecommendation(name='timestamp',
     recommended_value=None, level=OptionRecommendation.LOW,
     help=_('Set the book timestamp (used by the date column in calibre).')),

+OptionRecommendation(name='enable_heuristics',
+    recommended_value=False, level=OptionRecommendation.LOW,
+    help=_('Enable heuristic processing. This option must be set for any '
+           'heuristic processing to take place.')),
+
+OptionRecommendation(name='markup_chapter_headings',
+    recommended_value=True, level=OptionRecommendation.LOW,
+    help=_('Detect unformatted chapter headings and sub headings. Change '
+           'them to h2 and h3 tags. This setting will not create a TOC, '
+           'but can be used in conjunction with structure detection to create '
+           'one.')),
+
+OptionRecommendation(name='italicize_common_cases',
+    recommended_value=True, level=OptionRecommendation.LOW,
+    help=_('Look for common words and patterns that denote '
+           'italics and italicize them.')),
+
+OptionRecommendation(name='fix_indents',
+    recommended_value=True, level=OptionRecommendation.LOW,
+    help=_('Turn indentation created from multiple non-breaking space entities '
+           'into CSS indents.')),
+
+OptionRecommendation(name='html_unwrap_factor',
+    recommended_value=0.40, level=OptionRecommendation.LOW,
+    help=_('Scale used to determine the length at which a line should '
+           'be unwrapped. Valid values are a decimal between 0 and 1. The '
+           'default is 0.4, just below the median line length. If only a '
+           'few lines in the document require unwrapping this value should '
+           'be reduced')),
+
+OptionRecommendation(name='unwrap_lines',
+    recommended_value=True, level=OptionRecommendation.LOW,
+    help=_('Unwrap lines using punctuation and other formatting clues.')),
+
+OptionRecommendation(name='delete_blank_paragraphs',
+    recommended_value=True, level=OptionRecommendation.LOW,
+    help=_('Remove empty paragraphs from the document when they exist between '
+           'every other paragraph')),
+
+OptionRecommendation(name='format_scene_breaks',
+    recommended_value=True, level=OptionRecommendation.LOW,
+    help=_('Left aligned scene break markers are center aligned. '
+           'Replace soft scene breaks that use multiple blank lines with'
+           'horizontal rules.')),
+
+OptionRecommendation(name='dehyphenate',
+    recommended_value=True, level=OptionRecommendation.LOW,
+    help=_('Analyze hyphenated words throughout the document. The '
+           'document itself is used as a dictionary to determine whether hyphens '
+           'should be retained or removed.')),
+
+OptionRecommendation(name='renumber_headings',
+    recommended_value=True, level=OptionRecommendation.LOW,
+    help=_('Looks for occurrences of sequential <h1> or <h2> tags. '
+           'The tags are renumbered to prevent splitting in the middle '
+           'of chapter headings.')),
+
+OptionRecommendation(name='sr1_search',
+    recommended_value='', level=OptionRecommendation.LOW,
+    help=_('Search pattern (regular expression) to be replaced with '
+           'sr1-replace.')),
+
+OptionRecommendation(name='sr1_replace',
+    recommended_value='', level=OptionRecommendation.LOW,
+    help=_('Replacement to replace the text found with sr1-search.')),
+
+OptionRecommendation(name='sr2_search',
+    recommended_value='', level=OptionRecommendation.LOW,
+    help=_('Search pattern (regular expression) to be replaced with '
+           'sr2-replace.')),
+
+OptionRecommendation(name='sr2_replace',
+    recommended_value='', level=OptionRecommendation.LOW,
+    help=_('Replacement to replace the text found with sr2-search.')),
+
+OptionRecommendation(name='sr3_search',
+    recommended_value='', level=OptionRecommendation.LOW,
+    help=_('Search pattern (regular expression) to be replaced with '
+           'sr3-replace.')),
+
+OptionRecommendation(name='sr3_replace',
+    recommended_value='', level=OptionRecommendation.LOW,
+    help=_('Replacement to replace the text found with sr3-search.')),
 ]
+# }}}

         input_fmt = os.path.splitext(self.input)[1]
         if not input_fmt:
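All of the heuristic OptionRecommendations above are inert unless enable_heuristics is set; each sub-option can then be vetoed individually. A sketch of that two-level gate; the option object and step bodies here are illustrative stand-ins, not calibre's pipeline:

    class Opts(object):
        # Stand-in for the parsed conversion options (illustrative only).
        enable_heuristics = True
        unwrap_lines = True
        dehyphenate = False   # as if --disable-dehyphenate was passed

    def run_heuristics(html, opts):
        if not getattr(opts, 'enable_heuristics', False):
            return html   # master switch off: no heuristic runs at all
        steps = [('unwrap_lines', lambda h: h.replace('\n', ' ')),
                 ('dehyphenate', lambda h: h.replace('-\n', ''))]
        for name, func in steps:
            if getattr(opts, name, False):   # each step can be vetoed
                html = func(html)
        return html

    print(run_heuristics('line one\nline two', Opts()))  # 'line one line two'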
@@ -859,7 +903,6 @@ OptionRecommendation(name='timestamp',
         self.opts_to_mi(self.user_metadata)
         if not hasattr(self.oeb, 'manifest'):
             self.oeb = create_oebbook(self.log, self.oeb, self.opts,
-                    self.input_plugin,
                     encoding=self.input_plugin.output_encoding)
         self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
         self.opts.is_image_collection = self.input_plugin.is_image_collection
@@ -883,7 +926,8 @@ OptionRecommendation(name='timestamp',
         self.opts.dest = self.opts.output_profile

         from calibre.ebooks.oeb.transforms.metadata import MergeMetadata
-        MergeMetadata()(self.oeb, self.user_metadata, self.opts)
+        MergeMetadata()(self.oeb, self.user_metadata, self.opts,
+                override_input_metadata=self.override_input_metadata)
         pr(0.2)
         self.flush()

@@ -969,14 +1013,15 @@ OptionRecommendation(name='timestamp',
         self.log(self.output_fmt.upper(), 'output written to', self.output)
         self.flush()

-def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
+def create_oebbook(log, path_or_stream, opts, reader=None,
         encoding='utf-8', populate=True):
     '''
     Create an OEBBook.
     '''
     from calibre.ebooks.oeb.base import OEBBook
-    html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
-            opts.preprocess_html, opts)
+    html_preprocessor = HTMLPreProcessor(log, opts)
+    if not encoding:
+        encoding = None
     oeb = OEBBook(log, html_preprocessor,
             pretty_print=opts.pretty_print, input_encoding=encoding)
     if not populate:
@@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'

 import functools, re

-from calibre import entity_to_unicode
+from calibre import entity_to_unicode, as_unicode

 XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
 SVG_NS = 'http://www.w3.org/2000/svg'
@@ -78,6 +78,8 @@ class DocAnalysis(object):
             linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
         elif format == 'spanned_html':
             linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
+        elif format == 'txt':
+            linere = re.compile('.*?\n')
         self.lines = linere.findall(raw)

     def line_length(self, percent):
@@ -172,13 +174,19 @@ class Dehyphenator(object):
     retain hyphens.
     '''

-    def __init__(self):
+    def __init__(self, verbose=0, log=None):
+        self.log = log
+        self.verbose = verbose
         # Add common suffixes to the regex below to increase the likelihood of a match -
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+        # only remove if it's not already the point of hyphenation
+        self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$"
+        self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
+        self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
-        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
-        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
+        self.prefix_string = '^(dis|re|un|in|ex)'
+        self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
+        self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)

     def dehyphenate(self, match):
         firsthalf = match.group('firstpart')
@@ -189,31 +197,48 @@ class Dehyphenator(object):
             wraptags = ''
         hyphenated = unicode(firsthalf) + "-" + unicode(secondhalf)
         dehyphenated = unicode(firsthalf) + unicode(secondhalf)
+        if self.suffixes.match(secondhalf) is None:
             lookupword = self.removesuffixes.sub('', dehyphenated)
-        if self.prefixes.match(firsthalf) is None:
+        else:
+            lookupword = dehyphenated
+        if len(firsthalf) > 4 and self.prefixes.match(firsthalf) is None:
             lookupword = self.removeprefix.sub('', lookupword)
-        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        if self.verbose > 2:
+            self.log("lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated))
         try:
             searchresult = self.html.find(lookupword.lower())
         except:
             return hyphenated
-        if self.format == 'html_cleanup':
+        if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
             if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                if self.verbose > 2:
+                    self.log("    Cleanup:returned dehyphenated word: " + str(dehyphenated))
                 return dehyphenated
             elif self.html.find(hyphenated) != -1:
-                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                if self.verbose > 2:
+                    self.log("        Cleanup:returned hyphenated word: " + str(hyphenated))
                 return hyphenated
             else:
-                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                if self.verbose > 2:
+                    self.log("          Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf))
                 return firsthalf+u'\u2014'+wraptags+secondhalf

         else:
+            if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
+                if self.verbose > 2:
+                    self.log("too short, returned hyphenated word: " + str(hyphenated))
+                return hyphenated
+            if len(firsthalf) <= 2 and len(secondhalf) <= 2:
+                if self.verbose > 2:
+                    self.log("too short, returned hyphenated word: " + str(hyphenated))
+                return hyphenated
             if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "returned dehyphenated word: " + str(dehyphenated)
+                if self.verbose > 2:
+                    self.log("     returned dehyphenated word: " + str(dehyphenated))
                 return dehyphenated
             else:
-                #print "  returned hyphenated word: " + str(hyphenated)
+                if self.verbose > 2:
+                    self.log("          returned hyphenated word: " + str(hyphenated))
                 return hyphenated

     def __call__(self, html, format, length=1):
@@ -223,10 +248,15 @@ class Dehyphenator(object):
         intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)
     elif format == 'pdf':
         intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+    elif format == 'txt':
+        intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
     elif format == 'individual_words':
-        intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
+        intextmatch = re.compile(u'(?!<)(?P<firstpart>\w+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)')
     elif format == 'html_cleanup':
         intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
+    elif format == 'txt_cleanup':
+        intextmatch = re.compile(u'(?P<firstpart>\w+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')

     html = intextmatch.sub(self.dehyphenate, html)
     return html
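The Dehyphenator decides by using the document itself as the dictionary: when the joined form of a hyphenated pair occurs elsewhere in the text, the hyphen is judged to be a soft line-break artifact and removed. A condensed sketch of that decision, with the regexes greatly simplified relative to the real class:

    import re

    def dehyphenate(text):
        # Join 'first-\nsecond' only when 'firstsecond' already appears
        # somewhere in the document; otherwise keep the hyphen.
        lowered = text.lower()
        def repl(m):
            joined = m.group(1) + m.group(2)
            if lowered.find(joined.lower()) != -1:
                return joined
            return m.group(1) + '-' + m.group(2)
        return re.sub(r'(\w+)-\s*\n\s*(\w+)', repl, text)

    sample = 'The wave-\nlength shifted. A wavelength is a distance.'
    print(dehyphenate(sample))  # 'wavelength' is joined because it recurs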
@@ -353,7 +383,7 @@ class HTMLPreProcessor(object):
                   (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),

                   # Center separator lines
-                  (re.compile(u'<br>\s*(?P<break>([*#•✦]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
+                  (re.compile(u'<br>\s*(?P<break>([*#•✦=]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),

                   # Remove page links
                   (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
@@ -390,10 +420,8 @@ class HTMLPreProcessor(object):
                   (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                    lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
                   ]
-    def __init__(self, input_plugin_preprocess, plugin_preprocess,
-            extra_opts=None):
-        self.input_plugin_preprocess = input_plugin_preprocess
-        self.plugin_preprocess = plugin_preprocess
+    def __init__(self, log=None, extra_opts=None):
+        self.log = log
         self.extra_opts = extra_opts

     def is_baen(self, src):
@@ -429,27 +457,20 @@ class HTMLPreProcessor(object):
         if not getattr(self.extra_opts, 'keep_ligatures', False):
             html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)

+        for search, replace in [['sr3_search', 'sr3_replace'], ['sr2_search', 'sr2_replace'], ['sr1_search', 'sr1_replace']]:
+            search_pattern = getattr(self.extra_opts, search, '')
+            if search_pattern:
+                try:
+                    search_re = re.compile(search_pattern)
+                    replace_txt = getattr(self.extra_opts, replace, '')
+                    if not replace_txt:
+                        replace_txt = ''
+                    rules.insert(0, (search_re, replace_txt))
+                except Exception as e:
+                    self.log.error('Failed to parse %r regexp because %s' %
+                            (search, as_unicode(e)))
+
         end_rules = []
-        if getattr(self.extra_opts, 'remove_header', None):
-            try:
-                rules.insert(0,
-                    (re.compile(self.extra_opts.header_regex), lambda match : '')
-                )
-            except:
-                import traceback
-                print 'Failed to parse remove_header regexp'
-                traceback.print_exc()
-
-        if getattr(self.extra_opts, 'remove_footer', None):
-            try:
-                rules.insert(0,
-                    (re.compile(self.extra_opts.footer_regex), lambda match : '')
-                )
-            except:
-                import traceback
-                print 'Failed to parse remove_footer regexp'
-                traceback.print_exc()
-
         # delete soft hyphens - moved here so it's executed after header/footer removal
         if is_pdftohtml:
             # unwrap/delete soft hyphens
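The new sr1/sr2/sr3 options are compiled and pushed onto the front of the rule list; iterating sr3, sr2, sr1 and inserting at index 0 leaves sr1 first. A standalone sketch of applying such ordered search/replace pairs (the sample options dict is illustrative):

    import re

    opts = {'sr1_search': r'\bcolour\b', 'sr1_replace': 'color',
            'sr2_search': r'\s+$',       'sr2_replace': '',
            'sr3_search': '',            'sr3_replace': ''}  # unused slot

    rules = []
    for n in ('3', '2', '1'):          # reverse order: sr1 ends up first
        pattern = opts.get('sr%s_search' % n, '')
        if pattern:
            try:
                rules.insert(0, (re.compile(pattern, re.MULTILINE),
                                 opts.get('sr%s_replace' % n, '')))
            except re.error as e:
                print('bad sr%s pattern: %s' % (n, e))

    text = 'A colour test.   '
    for regex, replacement in rules:
        text = regex.sub(replacement, text)
    print(repr(text))  # 'A color test.'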
@@ -457,12 +478,6 @@ class HTMLPreProcessor(object):
             # unwrap/delete soft hyphens with formatting
             end_rules.append((re.compile(u'[\xad]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))

-        # Make the more aggressive chapter marking regex optional with the preprocess option to
-        # reduce false positives and move after header/footer removal
-        if getattr(self.extra_opts, 'preprocess_html', None):
-            if is_pdftohtml:
-                end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),)
-
         length = -1
         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
             docanalysis = DocAnalysis('pdf', html)
@@ -473,7 +488,7 @@ class HTMLPreProcessor(object):
             end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
             end_rules.append(
                 # Un wrap using punctuation
-                (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
             )

         for rule in self.PREPROCESS + start_rules:
@@ -505,15 +520,14 @@ class HTMLPreProcessor(object):

         if is_pdftohtml and length > -1:
             # Dehyphenate
-            dehyphenator = Dehyphenator()
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
             html = dehyphenator(html,'html', length)

         if is_pdftohtml:
-            from calibre.ebooks.conversion.utils import PreProcessor
-            pdf_markup = PreProcessor(self.extra_opts, None)
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            pdf_markup = HeuristicProcessor(self.extra_opts, None)
             totalwords = 0
-            totalwords = pdf_markup.get_word_count(html)
-            if totalwords > 7000:
+            if pdf_markup.get_word_count(html) > 7000:
                 html = pdf_markup.markup_chapters(html, totalwords, True)

         #dump(html, 'post-preprocess')
@@ -533,8 +547,10 @@ class HTMLPreProcessor(object):
             unidecoder = Unidecoder()
             html = unidecoder.decode(html)

-        if self.plugin_preprocess:
-            html = self.input_plugin_preprocess(self.extra_opts, html)
+        if getattr(self.extra_opts, 'enable_heuristics', False):
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            preprocessor = HeuristicProcessor(self.extra_opts, self.log)
+            html = preprocessor(html)

         if getattr(self.extra_opts, 'smarten_punctuation', False):
             html = self.smarten_punctuation(html)
@@ -561,8 +577,8 @@ class HTMLPreProcessor(object):
         html = html.replace(start, '<!--')
         html = html.replace(stop, '-->')
         # convert ellipsis to entities to prevent wrapping
-        html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html)
+        html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html)
         # convert double dashes to em-dash
-        html = re.sub('\s--\s', u'\u2014', html)
+        html = re.sub(r'\s--\s', u'\u2014', html)
         return substitute_entites(html)

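The last hunk only adds the r prefix to two patterns. With plain strings Python interprets escape sequences itself before re sees them; sequences like \s happen to survive because they are not recognized string escapes, but escapes such as \b silently become control characters. A short illustration:

    import re

    # '\s' in a plain string survives by accident, but '\b' becomes a
    # backspace character and the regex quietly stops matching word
    # boundaries. Raw strings avoid the trap.
    print(re.search('\bword\b', 'a word here'))   # None: '\b' is backspace
    print(re.search(r'\bword\b', 'a word here'))  # matches 'word'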
@@ -11,13 +11,22 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
 from calibre.utils.wordcount import get_wordcount_obj

-class PreProcessor(object):
+class HeuristicProcessor(object):

     def __init__(self, extra_opts=None, log=None):
         self.log = default_log if log is None else log
         self.html_preprocess_sections = 0
         self.found_indents = 0
         self.extra_opts = extra_opts
+        self.deleted_nbsps = False
+        self.totalwords = 0
+        self.min_chapters = 1
+        self.chapters_no_title = 0
+        self.chapters_with_title = 0
+        self.blanks_deleted = False
+        self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
+        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)

     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
@@ -27,12 +36,12 @@ class PreProcessor(object):
         title = match.group('title')
         if not title:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("marked " + unicode(self.html_preprocess_sections) +
+            self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                     " chapters. - " + unicode(chap))
             return '<h2>'+chap+'</h2>\n'
         else:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("marked " + unicode(self.html_preprocess_sections) +
+            self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                     " chapters & titles. - " + unicode(chap) + ", " + unicode(title))
             return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'

@@ -40,10 +49,18 @@ class PreProcessor(object):
         chap = match.group('section')
         styles = match.group('styles')
         self.html_preprocess_sections = self.html_preprocess_sections + 1
-        self.log("marked " + unicode(self.html_preprocess_sections) +
+        self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                 " section markers based on punctuation. - " + unicode(chap))
         return '<'+styles+' style="page-break-before:always">'+chap

+    def analyze_title_matches(self, match):
+        #chap = match.group('chap')
+        title = match.group('title')
+        if not title:
+            self.chapters_no_title = self.chapters_no_title + 1
+        else:
+            self.chapters_with_title = self.chapters_with_title + 1
+
     def insert_indent(self, match):
         pstyle = match.group('formatting')
         span = match.group('span')
@@ -75,8 +92,8 @@ class PreProcessor(object):
         line_end = line_end_ere.findall(raw)
         tot_htm_ends = len(htm_end)
         tot_ln_fds = len(line_end)
-        self.log("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
-                unicode(tot_htm_ends) + " marked up endings")
+        #self.log.debug("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
+        #        unicode(tot_htm_ends) + " marked up endings")

         if percent > 1:
             percent = 1
@@ -84,9 +101,8 @@ class PreProcessor(object):
             percent = 0

         min_lns = tot_ln_fds * percent
-        self.log("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
-        if min_lns > tot_htm_ends:
-            return True
+        #self.log.debug("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
+        return min_lns > tot_htm_ends

     def dump(self, raw, where):
         import os
@@ -112,16 +128,55 @@ class PreProcessor(object):
         wordcount = get_wordcount_obj(word_count_text)
         return wordcount.words

+    def markup_italicis(self, html):
+        ITALICIZE_WORDS = [
+            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
+            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
+            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
+            'Mlle.', 'Mons.', 'PS.', 'PPS.',
+        ]
+
+        ITALICIZE_STYLE_PATS = [
+            r'(?msu)(?<=\s)_(?P<words>\S[^_]{0,40}?\S)?_(?=\s)',
+            r'(?msu)(?<=\s)/(?P<words>\S[^/]{0,40}?\S)?/(?=\s)',
+            r'(?msu)(?<=\s)~~(?P<words>\S[^~]{0,40}?\S)?~~(?=\s)',
+            r'(?msu)(?<=\s)\*(?P<words>\S[^\*]{0,40}?\S)?\*(?=\s)',
+            r'(?msu)(?<=\s)~(?P<words>\S[^~]{0,40}?\S)?~(?=\s)',
+            r'(?msu)(?<=\s)_/(?P<words>\S[^/_]{0,40}?\S)?/_(?=\s)',
+            r'(?msu)(?<=\s)_\*(?P<words>\S[^\*_]{0,40}?\S)?\*_(?=\s)',
+            r'(?msu)(?<=\s)\*/(?P<words>\S[^/\*]{0,40}?\S)?/\*(?=\s)',
+            r'(?msu)(?<=\s)_\*/(?P<words>\S[^\*_]{0,40}?\S)?/\*_(?=\s)',
+            r'(?msu)(?<=\s)/:(?P<words>\S[^:/]{0,40}?\S)?:/(?=\s)',
+            r'(?msu)(?<=\s)\|:(?P<words>\S[^:\|]{0,40}?\S)?:\|(?=\s)',
+        ]
+
+        for word in ITALICIZE_WORDS:
+            html = html.replace(word, '<i>%s</i>' % word)
+
+        for pat in ITALICIZE_STYLE_PATS:
+            html = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), html)
+
+        return html
+
     def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
+        '''
+        Searches for common chapter headings throughout the document
+        attempts multiple patterns based on likelihood of a match
+        with minimum false positives. Exits after finding a successful pattern
+        '''
         # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
-        # minimum of chapters to search for
-        self.min_chapters = 1
+        # minimum of chapters to search for. A max limit is calculated to prevent things like OCR
+        # or pdf page numbers from being treated as TOC markers
+        max_chapters = 150
+        typical_chapters = 7000.
         if wordcount > 7000:
-            self.min_chapters = int(ceil(wordcount / 7000.))
-            #print "minimum chapters required are: "+str(self.min_chapters)
+            if wordcount > 200000:
+                typical_chapters = 15000.
+            self.min_chapters = int(ceil(wordcount / typical_chapters))
+        self.log.debug("minimum chapters required are: "+str(self.min_chapters))
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
-        self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
+        self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")

         # Build the Regular Expressions in pieces
         init_lookahead = "(?=<(p|div))"
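The new markup_italicis method restores italics from plain-text conventions: a fixed list of Latin abbreviations plus markers such as _word_ or /word/. A reduced sketch with only two of the marker styles (the sample text is illustrative):

    import re

    ITALICIZE_WORDS = ['et al.', 'i.e.', 'e.g.']
    ITALICIZE_STYLE_PATS = [
        r'(?msu)(?<=\s)_(?P<words>\S[^_]{0,40}?\S)?_(?=\s)',  # _underscored_
        r'(?msu)(?<=\s)/(?P<words>\S[^/]{0,40}?\S)?/(?=\s)',  # /slashed/
    ]

    def italicize(html):
        for word in ITALICIZE_WORDS:
            html = html.replace(word, '<i>%s</i>' % word)
        for pat in ITALICIZE_STYLE_PATS:
            html = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), html)
        return html

    print(italicize('See _Moby Dick_ and /Emma/ , i.e. the classics. '))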
@ -151,88 +206,160 @@ class PreProcessor(object):
|
|||||||
n_lookahead_open = "\s+(?!"
|
n_lookahead_open = "\s+(?!"
|
||||||
n_lookahead_close = ")"
|
n_lookahead_close = ")"
|
||||||
|
|
||||||
default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
|
default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
|
||||||
|
simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"
|
||||||
|
|
||||||
|
analysis_result = []
|
||||||
|
|
||||||
chapter_types = [
|
chapter_types = [
|
||||||
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
|
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
|
||||||
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
|
[r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles
|
||||||
[r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
|
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
|
||||||
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
|
[r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters
|
||||||
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
|
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering
|
||||||
[r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon
|
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
|
||||||
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
|
[r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon
|
||||||
|
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def recurse_patterns(html, analyze):
|
||||||
# Start with most typical chapter headings, get more aggressive until one works
|
# Start with most typical chapter headings, get more aggressive until one works
|
||||||
for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
|
for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
|
||||||
|
n_lookahead = ''
|
||||||
|
hits = 0
|
||||||
|
self.chapters_no_title = 0
|
||||||
|
self.chapters_with_title = 0
|
||||||
|
|
||||||
|
if n_lookahead_req:
|
||||||
|
lp_n_lookahead_open = n_lookahead_open
|
||||||
|
lp_n_lookahead_close = n_lookahead_close
|
||||||
|
else:
|
||||||
|
lp_n_lookahead_open = ''
|
||||||
|
lp_n_lookahead_close = ''
|
||||||
|
|
||||||
|
if strict_title:
|
||||||
|
lp_title = default_title
|
||||||
|
else:
|
||||||
|
lp_title = simple_title
|
||||||
|
|
||||||
|
if ignorecase:
|
||||||
|
arg_ignorecase = r'(?i)'
|
||||||
|
else:
|
||||||
|
arg_ignorecase = ''
|
||||||
|
|
||||||
|
if title_req:
|
||||||
|
lp_opt_title_open = ''
|
||||||
|
lp_opt_title_close = ''
|
||||||
|
else:
|
||||||
|
lp_opt_title_open = opt_title_open
|
||||||
|
lp_opt_title_close = opt_title_close
|
||||||
|
|
||||||
if self.html_preprocess_sections >= self.min_chapters:
|
if self.html_preprocess_sections >= self.min_chapters:
|
||||||
break
|
break
|
||||||
full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
|
full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
|
||||||
|
if n_lookahead_req:
|
||||||
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
|
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
|
||||||
self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
|
if not analyze:
|
||||||
if lookahead_ignorecase:
|
self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
|
||||||
chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
|
|
||||||
chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
|
chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
|
||||||
|
chapdetect = re.compile(r'%s' % chapter_marker)
|
||||||
|
|
||||||
|
if analyze:
|
||||||
|
hits = len(chapdetect.findall(html))
|
||||||
|
if hits:
|
||||||
|
chapdetect.sub(self.analyze_title_matches, html)
|
||||||
|
if float(self.chapters_with_title) / float(hits) > .5:
|
||||||
|
title_req = True
|
||||||
|
strict_title = False
|
||||||
|
self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
|
||||||
|
if type_name == 'common':
|
||||||
|
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
|
||||||
|
elif self.min_chapters <= hits < max_chapters:
|
||||||
|
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
|
||||||
|
break
|
||||||
else:
|
else:
|
||||||
chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
|
|
||||||
chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
|
|
||||||
html = chapdetect.sub(self.chapter_head, html)
|
html = chapdetect.sub(self.chapter_head, html)
|
||||||
|
return html
|
||||||
|
|
||||||
|
recurse_patterns(html, True)
|
||||||
|
chapter_types = analysis_result
|
||||||
|
html = recurse_patterns(html, False)
|
||||||
|
|
||||||
words_per_chptr = wordcount
|
words_per_chptr = wordcount
|
||||||
if words_per_chptr > 0 and self.html_preprocess_sections > 0:
|
if words_per_chptr > 0 and self.html_preprocess_sections > 0:
|
||||||
words_per_chptr = wordcount / self.html_preprocess_sections
|
words_per_chptr = wordcount / self.html_preprocess_sections
|
||||||
self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
|
self.log.debug("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
|
||||||
return html
|
return html
|
||||||
|
|
||||||
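The new recurse_patterns flow is easier to follow in isolation: a first pass only counts how often each heading pattern matches (analyze=True) and records the viable ones, then a second pass actually marks up the text with the survivors. A minimal standalone sketch of that two-pass idea, using hypothetical patterns and thresholds rather than calibre's real regexes:

import re

patterns = [
    (r'(?im)^chapter\s+\d+.*$', 'numeric'),
    (r'(?im)^[A-Z][A-Z ]{3,40}$', 'uppercase'),
]

def analyze_then_mark(text, min_hits=2, max_hits=150):
    # Pass 1: keep only patterns whose hit count looks like real chapter headings
    viable = [(p, name) for p, name in patterns
              if min_hits <= len(re.findall(p, text)) < max_hits]
    # Pass 2: wrap matches of the surviving patterns in heading markup
    for p, name in viable:
        text = re.sub(p, lambda m: '<h2>%s</h2>' % m.group(0), text)
    return text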
+    def punctuation_unwrap(self, length, content, format):
+        '''
+        Unwraps lines based on line length and punctuation
+        supports a range of html markup and text files
+        '''
+        # define the pieces of the regex
+        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
+        em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
+        soft_hyphen = u"\xad"
+        line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
+        blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
+        line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
+        txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
+
+        unwrap_regex = lookahead+line_ending+blanklines+line_opening
+        em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
+        shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
+
+        if format == 'txt':
+            unwrap_regex = lookahead+txt_line_wrap
+            em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
+            shy_unwrap_regex = soft_hyphen+txt_line_wrap
+
+        unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
+        em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
+        shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
+
+        content = unwrap.sub(' ', content)
+        content = em_en_unwrap.sub('', content)
+        content = shy_unwrap.sub('', content)
+        return content

-    def __call__(self, html):
-        self.log("********* Preprocessing HTML *********")
-
-        # Count the words in the document to estimate how many chapters to look for and whether
-        # other types of processing are attempted
-        totalwords = 0
-        totalwords = self.get_word_count(html)
-
-        if totalwords < 20:
-            self.log("not enough text, not preprocessing")
-            return html
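The unwrap regexes above all hinge on one trick: a fixed-width lookbehind of exactly `length` characters ending in mid-sentence punctuation, so only lines that fill the measured line width get joined. A small self-contained illustration with a toy length (the real code derives length from the document's median line length):

import re

length = 20  # pretend this is the median line length measured by DocAnalysis
# join a hard break only if the line is "full" and ends in a letter or comma
unwrap = re.compile(u"(?<=.{%i}[a-z,])\s*\n\s*" % length, re.UNICODE)

sample = u"This line was wrapped by the\nOCR engine mid-sentence.\nShort line.\nNext paragraph."
print unwrap.sub(u' ', sample)
# only the first break is removed: it follows >= 20 chars ending in a letter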
-        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
-        html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
-        html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
-
-        ###### Check Markup ######
-        #
-        # some lit files don't have any <p> tags or equivalent (generally just plain text between
-        # <pre> tags), check and mark up line endings if required before proceeding
-        if self.no_markup(html, 0.1):
-            self.log("not enough paragraph markers, adding now")
-            # check if content is in pre tags, use txt processor to mark up if so
-            pre = re.compile(r'<pre>', re.IGNORECASE)
-            if len(pre.findall(html)) == 1:
-                self.log("Running Text Processing")
+    def txt_process(self, match):
        from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
            separate_paragraphs_single_line
-                outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
-                html = outerhtml.sub('\g<text>', html)
-                html = separate_paragraphs_single_line(html)
-                html = preserve_spaces(html)
-                html = convert_basic(html, epub_split_size_kb=0)
+        content = match.group('text')
+        content = separate_paragraphs_single_line(content)
+        content = preserve_spaces(content)
+        content = convert_basic(content, epub_split_size_kb=0)
+        return content
+    def markup_pre(self, html):
+        pre = re.compile(r'<pre>', re.IGNORECASE)
+        if len(pre.findall(html)) >= 1:
+            self.log.debug("Running Text Processing")
+            outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
+            html = outerhtml.sub(self.txt_process, html)
        else:
            # Add markup naively
            # TODO - find out if there are cases where there are more than one <pre> tag or
            # other types of unmarked html and handle them in some better fashion
            add_markup = re.compile('(?<!>)(\n)')
            html = add_markup.sub('</p>\n<p>', html)
+        return html

-        ###### Mark Indents/Cleanup ######
-        #
-        # Replace series of non-breaking spaces with text-indent
+    def arrange_htm_line_endings(self, html):
+        html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
+        html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
+        return html

+    def fix_nbsp_indents(self, html):
        txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
        html = txtindent.sub(self.insert_indent, html)
        if self.found_indents > 1:
-            self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
+            self.log.debug("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
+        return html

+    def cleanup_markup(self, html):
        # remove remaining non-breaking spaces
        html = re.sub(ur'\u00a0', ' ', html)
        # Get rid of various common microsoft specific tags which can cause issues later
@@ -240,109 +367,166 @@ class PreProcessor(object):
        html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
        # Delete microsoft 'smart' tags
        html = re.sub('(?i)</?st1:\w+>', '', html)
-        # Get rid of empty span, bold, & italics tags
+        # Get rid of empty span, bold, font, em, & italics tags
        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
-        html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
+        html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
-        # ADE doesn't render <br />, change to empty paragraphs
-        #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
+        html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
+        self.deleted_nbsps = True
+        return html

-        # If more than 40% of the lines are empty paragraphs and the user has enabled remove
-        # paragraph spacing then delete blank lines to clean up spacing
-        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
-        blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
-        #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
-        blanklines = blankreg.findall(html)
-        lines = linereg.findall(html)
-        blanks_between_paragraphs = False
-        if len(lines) > 1:
-            self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
-                    unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
-            if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
-                    'remove_paragraph_spacing', False):
-                self.log("deleting blank lines")
-                html = blankreg.sub('', html)
-            elif float(len(blanklines)) / float(len(lines)) > 0.40:
-                blanks_between_paragraphs = True
-                #print "blanks between paragraphs is marked True"
-            else:
-                blanks_between_paragraphs = False
-
-        #self.dump(html, 'before_chapter_markup')
-        # detect chapters/sections to match xpath or splitting logic
-        #
-        html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
-
-        ###### Unwrap lines ######
-        #
-        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
-        # span are used for hard line breaks, p for new paragraphs. Determine which is used so
-        # that lines can be un-wrapped across page boundaries
+    def analyze_line_endings(self, html):
+        '''
+        determines the type of html line ending used most commonly in a document
+        use before calling docanalysis functions
+        '''
        paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
        spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
        paras = len(paras_reg.findall(html))
        spans = len(spans_reg.findall(html))
        if spans > 1:
            if float(paras) / float(spans) < 0.75:
-                format = 'spanned_html'
+                return 'spanned_html'
            else:
-                format = 'html'
+                return 'html'
        else:
-            format = 'html'
+            return 'html'
+    def analyze_blanks(self, html):
+        blanklines = self.blankreg.findall(html)
+        lines = self.linereg.findall(html)
+        if len(lines) > 1:
+            self.log.debug("There are " + unicode(len(blanklines)) + " blank lines. " +
+                    unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
+
+        if float(len(blanklines)) / float(len(lines)) > 0.40:
+            return True
+        else:
+            return False

+    def cleanup_required(self):
+        for option in ['unwrap_lines', 'markup_chapter_headings', 'format_scene_breaks', 'delete_blank_paragraphs']:
+            if getattr(self.extra_opts, option, False):
+                return True
+        return False
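analyze_blanks boils down to one ratio: empty <p></p> pairs over all paragraphs, with 40% as the cutoff for "this book uses blank lines between paragraphs". A rough standalone equivalent (the real method reuses the precompiled self.blankreg and self.linereg patterns):

import re

def blanks_between_paragraphs(html, threshold=0.40):
    lines = re.findall('(?<=<p).*?(?=</p>)', html, re.IGNORECASE | re.DOTALL)
    blanks = re.findall(r'\s*<p[^>]*>\s*</p>', html, re.IGNORECASE)
    if len(lines) < 2:
        return False  # unlike the diffed code, guard against an empty document
    return float(len(blanks)) / float(len(lines)) > threshold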
+    def __call__(self, html):
+        self.log.debug("********* Heuristic processing HTML *********")
+
+        # Count the words in the document to estimate how many chapters to look for and whether
+        # other types of processing are attempted
+        try:
+            self.totalwords = self.get_word_count(html)
+        except:
+            self.log.warn("Can't get wordcount")
+
+        if self.totalwords < 50:
+            self.log.warn("flow is too short, not running heuristics")
+            return html
+
+        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+        html = self.arrange_htm_line_endings(html)
+
+        if self.cleanup_required():
+            ###### Check Markup ######
+            #
+            # some lit files don't have any <p> tags or equivalent (generally just plain text between
+            # <pre> tags), check and mark up line endings if required before proceeding
+            # fix indents must run after this step
+            if self.no_markup(html, 0.1):
+                self.log.debug("not enough paragraph markers, adding now")
+                # markup using text processing
+                html = self.markup_pre(html)
+
+        # Replace series of non-breaking spaces with text-indent
+        if getattr(self.extra_opts, 'fix_indents', False):
+            html = self.fix_nbsp_indents(html)
+
+        if self.cleanup_required():
+            # fix indents must run before this step, as it removes non-breaking spaces
+            html = self.cleanup_markup(html)
+
+        # ADE doesn't render <br />, change to empty paragraphs
+        #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
+
+        # Determine whether the document uses interleaved blank lines
+        blanks_between_paragraphs = self.analyze_blanks(html)
+
+        #self.dump(html, 'before_chapter_markup')
+        # detect chapters/sections to match xpath or splitting logic
+
+        if getattr(self.extra_opts, 'markup_chapter_headings', False):
+            html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
+
+        if getattr(self.extra_opts, 'italicize_common_cases', False):
+            html = self.markup_italicis(html)
+
+        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
+        # blank paragraphs then delete blank lines to clean up spacing
+        if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
+            self.log.debug("deleting blank lines")
+            self.blanks_deleted = True
+            html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
+            html = self.blankreg.sub('', html)
+
+        # Determine line ending type
+        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+        # span are used for hard line breaks, p for new paragraphs. Determine which is used so
+        # that lines can be un-wrapped across page boundaries
+        format = self.analyze_line_endings(html)
        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
        # more of the lines break in the same region of the document then unwrapping is required
        docanalysis = DocAnalysis(format, html)
        hardbreaks = docanalysis.line_histogram(.50)
-        self.log("Hard line breaks check returned "+unicode(hardbreaks))
+        self.log.debug("Hard line breaks check returned "+unicode(hardbreaks))

        # Calculate Length
        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
        length = docanalysis.line_length(unwrap_factor)
-        self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
+        self.log.debug("Median line length is " + unicode(length) + ", calculated with " + format + " format")

+        ###### Unwrap lines ######
+        if getattr(self.extra_opts, 'unwrap_lines', False):
            # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
            if hardbreaks or unwrap_factor < 0.4:
-                self.log("Unwrapping required, unwrapping Lines")
-                # Unwrap em/en dashes
-                html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
-                # Dehyphenate
-                self.log("Unwrapping/Removing hyphens")
-                dehyphenator = Dehyphenator()
+                self.log.debug("Unwrapping required, unwrapping Lines")
+                # Dehyphenate with line length limiters
+                dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
                html = dehyphenator(html,'html', length)
-                self.log("Done dehyphenating")
-                # Unwrap lines using punctation and line length
-                #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
-                unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-                html = unwrap.sub(' ', html)
-                #check any remaining hyphens, but only unwrap if there is a match
-                dehyphenator = Dehyphenator()
-                html = dehyphenator(html,'html_cleanup', length)
-            else:
-                # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
-                self.log("Cleaning up hyphenation")
-                dehyphenator = Dehyphenator()
-                html = dehyphenator(html,'html_cleanup', length)
-                self.log("Done dehyphenating")
+                html = self.punctuation_unwrap(length, html, 'html')

-        # delete soft hyphens
-        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
+        if getattr(self.extra_opts, 'dehyphenate', False):
+            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
+            self.log.debug("Fixing hyphenated content")
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
+            html = dehyphenator(html,'html_cleanup', length)
+            html = dehyphenator(html, 'individual_words', length)

        # If still no sections after unwrapping mark split points on lines with no punctuation
-        if self.html_preprocess_sections < self.min_chapters:
-            self.log("Looking for more split points based on punctuation,"
+        if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
+            self.log.debug("Looking for more split points based on punctuation,"
                    " currently have " + unicode(self.html_preprocess_sections))
            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
            html = chapdetect3.sub(self.chapter_break, html)

+        if getattr(self.extra_opts, 'renumber_headings', False):
            # search for places where a first or second level heading is immediately followed by another
            # top level heading. demote the second heading to h3 to prevent splitting between chapter
            # headings and titles, images, etc
            doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
            html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)

-        # put back non-breaking spaces in empty paragraphs to preserve original formatting
-        html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
+        if getattr(self.extra_opts, 'format_scene_breaks', False):
            # Center separator lines
-            html = re.sub(u'<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
+            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
+            if not self.blanks_deleted:
+                html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
+            html = re.sub('<p\s+id="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)

+        if self.deleted_nbsps:
+            # put back non-breaking spaces in empty paragraphs to preserve original formatting
+            html = self.blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)

        return html
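Every heuristic in the new __call__ is gated on an extra_opts flag through getattr with a False default, so a missing attribute simply disables the step instead of raising. The pattern in isolation, with stand-in step bodies:

class Opts(object):
    unwrap_lines = True  # user enabled this one; other flags left unset

def run_heuristics(html, extra_opts):
    # each step runs only if its flag exists and is truthy
    if getattr(extra_opts, 'unwrap_lines', False):
        html = html.replace('-\n', '')  # stand-in for the real unwrap step
    if getattr(extra_opts, 'delete_blank_paragraphs', False):
        html = html.replace('<p></p>', '')
    return html

print run_heuristics(u'some-\ntext<p></p>', Opts())
# -> u'sometext<p></p>' ; only the enabled step ran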
@@ -16,7 +16,6 @@ import uuid
from lxml import etree

-from calibre import guess_type
from calibre import prepare_string_for_xml
from calibre.constants import __appname__, __version__
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace

@@ -102,6 +101,7 @@ class FB2MLizer(object):
        metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year)
        metadata['lang'] = u''.join(self.oeb_book.metadata.lang) if self.oeb_book.metadata.lang else 'en'
        metadata['id'] = None
+        metadata['cover'] = self.get_cover()

        author_parts = self.oeb_book.metadata.creator[0].value.split(' ')
        if len(author_parts) == 1:

@@ -124,6 +124,7 @@ class FB2MLizer(object):
            metadata['id'] = str(uuid.uuid4())

        for key, value in metadata.items():
+            if not key == 'cover':
                metadata[key] = prepare_string_for_xml(value)

        return u'<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:xlink="http://www.w3.org/1999/xlink">' \

@@ -136,6 +137,7 @@ class FB2MLizer(object):
            '<last-name>%(author_last)s</last-name>' \
            '</author>' \
            '<book-title>%(title)s</book-title>' \
+            '%(cover)s' \
            '<lang>%(lang)s</lang>' \
            '</title-info>' \
            '<document-info>' \
@@ -154,6 +156,39 @@ class FB2MLizer(object):
    def fb2_footer(self):
        return u'</FictionBook>'

+    def get_cover(self):
+        cover_href = None
+
+        # Get the raster cover if it's available.
+        if self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
+            id = unicode(self.oeb_book.metadata.cover[0])
+            cover_item = self.oeb_book.manifest.ids[id]
+            if cover_item.media_type in OEB_RASTER_IMAGES:
+                cover_href = cover_item.href
+        else:
+            # Figure out if we have a title page or a cover page
+            page_name = ''
+            if 'titlepage' in self.oeb_book.guide:
+                page_name = 'titlepage'
+            elif 'cover' in self.oeb_book.guide:
+                page_name = 'cover'
+
+            if page_name:
+                cover_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[page_name].href]
+                # Get the first image in the page
+                for img in cover_item.xpath('//img'):
+                    cover_href = cover_item.abshref(img.get('src'))
+                    break
+
+        if cover_href:
+            # Only write the image tag if it is in the manifest.
+            if cover_href in self.oeb_book.manifest.hrefs.keys():
+                if cover_href not in self.image_hrefs.keys():
+                    self.image_hrefs[cover_href] = '_%s.jpg' % len(self.image_hrefs.keys())
+                return u'<coverpage><image xlink:href="#%s" /></coverpage>' % self.image_hrefs[cover_href]
+
+        return u''

    def get_text(self):
        text = ['<body>']
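get_cover tries the declared raster cover first and only then falls back to scraping the first <img> out of a titlepage or cover guide entry. The precedence is easier to see as a stripped-down sketch with hypothetical dict-shaped inputs, not the real FB2MLizer/OEB API:

def pick_cover_href(metadata_cover, manifest_ids, guide):
    # 1. an explicit raster cover declared in the metadata wins
    if metadata_cover and metadata_cover in manifest_ids:
        return manifest_ids[metadata_cover]
    # 2. otherwise fall back to the guide's titlepage, then its cover page
    for page_name in ('titlepage', 'cover'):
        if page_name in guide:
            return guide[page_name]
    return None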
@@ -162,23 +197,6 @@ class FB2MLizer(object):
        text.append('<section>')
        self.section_level += 1

-        # Insert the title page / cover into the spine if it is not already referenced.
-        title_name = u''
-        if 'titlepage' in self.oeb_book.guide:
-            title_name = 'titlepage'
-        elif 'cover' in self.oeb_book.guide:
-            title_name = 'cover'
-        if title_name:
-            title_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[title_name].href]
-            if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml':
-                self.oeb_book.spine.insert(0, title_item, True)
-        # Create xhtml page to reference cover image so it can be used.
-        if not title_name and self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
-            id = unicode(self.oeb_book.metadata.cover[0])
-            cover_item = self.oeb_book.manifest.ids[id]
-            if cover_item.media_type in OEB_RASTER_IMAGES:
-                self.insert_image_cover(cover_item.href)

        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to FictionBook2 XML' % item.href)
            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)

@@ -203,17 +221,6 @@ class FB2MLizer(object):
        return ''.join(text) + '</body>'

-    def insert_image_cover(self, image_href):
-        from calibre.ebooks.oeb.base import RECOVER_PARSER
-        try:
-            root = etree.fromstring(u'<html xmlns="%s"><body><img src="%s" /></body></html>' % (XHTML_NS, image_href), parser=RECOVER_PARSER)
-        except:
-            root = etree.fromstring(u'', parser=RECOVER_PARSER)
-
-        id, href = self.oeb_book.manifest.generate('fb2_cover', 'fb2_cover.xhtml')
-        item = self.oeb_book.manifest.add(id, href, guess_type(href)[0], data=root)
-        self.oeb_book.spine.insert(0, item, True)

    def fb2mlize_images(self):
        '''
        This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function.
@@ -46,15 +46,19 @@ class FB2Input(InputFormatPlugin):
        log.debug('Parsing XML...')
        raw = stream.read().replace('\0', '')
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
-            assume_utf8=True)[0]
+            assume_utf8=True, resolve_entities=True)[0]
        try:
            doc = etree.fromstring(raw)
        except etree.XMLSyntaxError:
            try:
                doc = etree.fromstring(raw, parser=RECOVER_PARSER)
+                if doc is None:
+                    raise Exception('parse failed')
            except:
                doc = etree.fromstring(raw.replace('& ', '&amp;'),
                        parser=RECOVER_PARSER)
+        if doc is None:
+            raise ValueError('The FB2 file is not valid XML')
        stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
        css = ''
        for s in stylesheets:

@@ -100,7 +104,11 @@ class FB2Input(InputFormatPlugin):
        entries = [(f, guess_type(f)[0]) for f in os.listdir('.')]
        opf.create_manifest(entries)
        opf.create_spine(['index.xhtml'])
+        if mi.cover_data and mi.cover_data[1]:
+            with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
+                f.write(mi.cover_data[1])
+            opf.guide.set_cover(os.path.abspath('fb2_cover_calibre_mi.jpg'))
+        else:
            for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
                href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
                if href is not None:
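The None checks matter because lxml's recovering parser does not always raise on hopeless input; it can hand back None instead of an element, which is exactly what the diff now guards against. A minimal reproduction of the pattern (in calibre, RECOVER_PARSER is an etree.XMLParser(recover=True)):

from lxml import etree

RECOVER_PARSER = etree.XMLParser(recover=True)

def parse_or_die(raw):
    try:
        doc = etree.fromstring(raw)
    except etree.XMLSyntaxError:
        doc = etree.fromstring(raw, parser=RECOVER_PARSER)
    if doc is None:  # recover=True can swallow errors instead of raising
        raise ValueError('not valid XML')
    return doc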
@@ -21,10 +21,9 @@ from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.chardet import xml_to_unicode
from calibre.customize.conversion import OptionRecommendation
from calibre.constants import islinux, isfreebsd, iswindows
-from calibre import unicode_path
+from calibre import unicode_path, as_unicode
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.utils import PreProcessor

class Link(object):
    '''

@@ -112,14 +111,14 @@ class HTMLFile(object):
            with open(self.path, 'rb') as f:
                src = f.read()
        except IOError, err:
-            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
+            msg = 'Could not read from file: %s with error: %s'%(self.path, as_unicode(err))
            if level == 0:
                raise IOError(msg)
            raise IgnoreFile(msg, err.errno)

        self.is_binary = level > 0 and not bool(self.HTML_PAT.search(src[:4096]))
        if not self.is_binary:
-            if encoding is None:
+            if not encoding:
                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
            self.encoding = encoding
        else:

@@ -296,7 +295,7 @@ class HTMLInput(InputFormatPlugin):
            return oeb

        from calibre.ebooks.conversion.plumber import create_oebbook
-        return create_oebbook(log, stream.name, opts, self,
+        return create_oebbook(log, stream.name, opts,
                encoding=opts.input_encoding)

    def is_case_sensitive(self, path):

@@ -485,9 +484,3 @@ class HTMLInput(InputFormatPlugin):
            self.log.exception('Failed to read CSS file: %r'%link)
            return (None, None)
        return (None, raw)
-
-    def preprocess_html(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)
@@ -7,8 +7,6 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor

class LITInput(InputFormatPlugin):

@@ -22,7 +20,7 @@ class LITInput(InputFormatPlugin):
        from calibre.ebooks.lit.reader import LitReader
        from calibre.ebooks.conversion.plumber import create_oebbook
        self.log = log
-        return create_oebbook(log, stream, options, self, reader=LitReader)
+        return create_oebbook(log, stream, options, reader=LitReader)

    def postprocess_book(self, oeb, opts, log):
        from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML

@@ -39,10 +37,13 @@ class LITInput(InputFormatPlugin):
            body = body[0]
            if len(body) == 1 and body[0].tag == XHTML('pre'):
                pre = body[0]
-                from calibre.ebooks.txt.processor import convert_basic
+                from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
+                    separate_paragraphs_single_line
                from lxml import etree
                import copy
-                html = convert_basic(pre.text).replace('<html>',
+                html = separate_paragraphs_single_line(pre.text)
+                html = preserve_spaces(html)
+                html = convert_basic(html).replace('<html>',
                        '<html xmlns="%s">'%XHTML_NS)
                root = etree.fromstring(html)
                body = XPath('//h:body')(root)

@@ -51,10 +52,3 @@ class LITInput(InputFormatPlugin):
                for elem in body:
                    ne = copy.deepcopy(elem)
                    pre.append(ne)
-
-    def preprocess_html(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)
@@ -12,7 +12,6 @@ from copy import deepcopy
from lxml import etree

from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
from calibre import guess_type

class Canvas(etree.XSLTExtension):

@@ -419,11 +418,3 @@ class LRFInput(InputFormatPlugin):
            f.write(result)
        styles.write()
        return os.path.abspath('content.opf')
-
-    def preprocess_html(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)
@@ -324,14 +324,16 @@ class Metadata(object):
        if metadata is None:
            traceback.print_stack()
            return
-        metadata = copy.deepcopy(metadata)
-        if '#value#' not in metadata:
-            if metadata['datatype'] == 'text' and metadata['is_multiple']:
-                metadata['#value#'] = []
+        m = {}
+        for k in metadata:
+            m[k] = copy.copy(metadata[k])
+        if '#value#' not in m:
+            if m['datatype'] == 'text' and m['is_multiple']:
+                m['#value#'] = []
            else:
-                metadata['#value#'] = None
+                m['#value#'] = None
        _data = object.__getattribute__(self, '_data')
-        _data['user_metadata'][field] = metadata
+        _data['user_metadata'][field] = m

    def template_to_attribute(self, other, ops):
        '''
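The Metadata change swaps copy.deepcopy of the whole user-metadata dict for a per-key copy.copy, which is much cheaper and still keeps the stored dict and its immediate values independent of the caller; only objects nested one level deeper remain shared. A small sketch of the difference:

import copy

metadata = {'datatype': 'text', 'display': {'choices': ['a', 'b']}}

m = {}
for k in metadata:
    m[k] = copy.copy(metadata[k])     # one level of copying per value

m['datatype'] = 'int'                 # top-level writes don't leak back
assert metadata['datatype'] == 'text'
m['display']['sort'] = True           # the 'display' dict itself was copied too
assert 'sort' not in metadata['display']
m['display']['choices'].append('c')   # but objects nested inside it are shared
assert metadata['display']['choices'] == ['a', 'b', 'c']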
@@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
Fetch cover from LibraryThing.com based on ISBN number.
'''

-import sys, socket, os, re
+import sys, socket, os, re, random

from lxml import html
import mechanize

@@ -16,13 +16,26 @@ from calibre.ebooks.chardet import strip_encoding_declarations

OPENLIBRARY = 'http://covers.openlibrary.org/b/isbn/%s-L.jpg?default=false'

+def get_ua():
+    choices = [
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
+        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)',
+        'Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.1 Safari/525.19',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
+    ]
+    return choices[random.randint(0, len(choices)-1)]

class HeadRequest(mechanize.Request):

    def get_method(self):
        return 'HEAD'

def check_for_cover(isbn, timeout=5.):
-    br = browser()
+    br = browser(user_agent=get_ua())
    br.set_handle_redirect(False)
    try:
        br.open_novisit(HeadRequest(OPENLIBRARY%isbn), timeout=timeout)

@@ -51,7 +64,7 @@ def login(br, username, password, force=True):

def cover_from_isbn(isbn, timeout=5., username=None, password=None):
    src = None
-    br = browser()
+    br = browser(user_agent=get_ua())
    try:
        return br.open(OPENLIBRARY%isbn, timeout=timeout).read(), 'jpg'
    except:

@@ -100,7 +113,7 @@ def get_social_metadata(title, authors, publisher, isbn, username=None,
    from calibre.ebooks.metadata import MetaInformation
    mi = MetaInformation(title, authors)
    if isbn:
-        br = browser()
+        br = browser(user_agent=get_ua())
        if username and password:
            try:
                login(br, username, password, force=False)
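Rotating the User-Agent per request is the whole workaround here: LibraryThing was sniffing the default client string, so every browser() call now gets a freshly picked UA. A short usage sketch (the UA strings below are illustrative, not a fixed list):

import random

def get_ua():
    # any plausible desktop UA string, chosen per call
    choices = [
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
    ]
    # random.choice(choices) is the idiomatic equivalent of
    # choices[random.randint(0, len(choices)-1)] used in the diff
    return random.choice(choices)

# every network call builds a fresh browser so the UA varies per request:
# br = browser(user_agent=get_ua())   # calibre's mechanize helper, as in the diff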
@@ -10,7 +10,8 @@ from calibre.ebooks.metadata import MetaInformation, string_to_authors
title_pat = re.compile(r'\{\\info.*?\{\\title(.*?)(?<!\\)\}', re.DOTALL)
author_pat = re.compile(r'\{\\info.*?\{\\author(.*?)(?<!\\)\}', re.DOTALL)
comment_pat = re.compile(r'\{\\info.*?\{\\subject(.*?)(?<!\\)\}', re.DOTALL)
-category_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)
+tags_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)
+publisher_pat = re.compile(r'\{\\info.*?\{\\manager(.*?)(?<!\\)\}', re.DOTALL)

def get_document_info(stream):
    """

@@ -82,61 +83,73 @@ def decode(raw, codec):

def get_metadata(stream):
    """ Return metadata as a L{MetaInfo} object """
-    title, author, comment, category = None, None, None, None
    stream.seek(0)
    if stream.read(5) != r'{\rtf':
-        return MetaInformation(None, None)
+        return MetaInformation(_('Unknown'))
    block = get_document_info(stream)[0]
    if not block:
-        return MetaInformation(None, None)
+        return MetaInformation(_('Unknown'))

    stream.seek(0)
    cpg = detect_codepage(stream)
    stream.seek(0)

    title_match = title_pat.search(block)
-    if title_match:
+    if title_match is not None:
        title = decode(title_match.group(1).strip(), cpg)
+    else:
+        title = _('Unknown')
    author_match = author_pat.search(block)
-    if author_match:
+    if author_match is not None:
        author = decode(author_match.group(1).strip(), cpg)
-    comment_match = comment_pat.search(block)
-    if comment_match:
-        comment = decode(comment_match.group(1).strip(), cpg)
-    category_match = category_pat.search(block)
-    if category_match:
-        category = decode(category_match.group(1).strip(), cpg)
-    mi = MetaInformation(title, author)
+    else:
+        author = None
+    mi = MetaInformation(title)
    if author:
        mi.authors = string_to_authors(author)

+    comment_match = comment_pat.search(block)
+    if comment_match is not None:
+        comment = decode(comment_match.group(1).strip(), cpg)
        mi.comments = comment
-    mi.category = category
+    tags_match = tags_pat.search(block)
+    if tags_match is not None:
+        tags = decode(tags_match.group(1).strip(), cpg)
+        mi.tags = tags
+    publisher_match = publisher_pat.search(block)
+    if publisher_match is not None:
+        publisher = decode(publisher_match.group(1).strip(), cpg)
+        mi.publisher = publisher
    return mi
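The \manager group is an RTF oddity worth noting: RTF's info block has no dedicated publisher field, so the publisher travels in {\manager ...} and the tag list in {\category ...}. The patterns are easy to exercise directly:

import re

publisher_pat = re.compile(r'\{\\info.*?\{\\manager(.*?)(?<!\\)\}', re.DOTALL)
tags_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)

block = r'{\info{\title A Book}{\manager Some Press}{\category fiction, test}}'
print publisher_pat.search(block).group(1).strip()  # -> 'Some Press'
print tags_pat.search(block).group(1).strip()       # -> 'fiction, test'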
def create_metadata(stream, options):
-    md = r'{\info'
+    md = [r'{\info']
    if options.title:
        title = options.title.encode('ascii', 'ignore')
-        md += r'{\title %s}'%(title,)
+        md.append(r'{\title %s}'%(title,))
    if options.authors:
        au = options.authors
        if not isinstance(au, basestring):
            au = u', '.join(au)
        author = au.encode('ascii', 'ignore')
-        md += r'{\author %s}'%(author,)
+        md.append(r'{\author %s}'%(author,))
-    if options.get('category', None):
-        category = options.category.encode('ascii', 'ignore')
-        md += r'{\category %s}'%(category,)
    comp = options.comment if hasattr(options, 'comment') else options.comments
    if comp:
        comment = comp.encode('ascii', 'ignore')
-        md += r'{\subject %s}'%(comment,)
+        md.append(r'{\subject %s}'%(comment,))
-    if len(md) > 6:
-        md += '}'
+    if options.publisher:
+        publisher = options.publisher.encode('ascii', 'ignore')
+        md.append(r'{\manager %s}'%(publisher,))
+    if options.tags:
+        tags = u', '.join(options.tags)
+        tags = tags.encode('ascii', 'ignore')
+        md.append(r'{\category %s}'%(tags,))
+    if len(md) > 1:
+        md.append('}')
    stream.seek(0)
    src = stream.read()
-    ans = src[:6] + md + src[6:]
+    ans = src[:6] + u''.join(md) + src[6:]
    stream.seek(0)
    stream.write(ans)
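The rewrite from string concatenation to a list of parts also fixes the old closing-brace test: len(md) > 6 measured characters of the growing string (6 being the length of r'{\info'), while len(md) > 1 now simply asks whether any metadata group was appended after the prefix. The pattern in miniature:

parts = [r'{\info']
parts.append(r'{\title Example}')
if len(parts) > 1:        # at least one metadata group was added
    parts.append('}')
print u''.join(parts)     # -> u'{\info{\title Example}}'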
@@ -156,7 +169,7 @@ def set_metadata(stream, options):

    base_pat = r'\{\\name(.*?)(?<!\\)\}'
    title = options.title
-    if title != None:
+    if title is not None:
        title = title.encode('ascii', 'replace')
        pat = re.compile(base_pat.replace('name', 'title'), re.DOTALL)
        if pat.search(src):

@@ -164,7 +177,7 @@ def set_metadata(stream, options):
        else:
            src = add_metadata_item(src, 'title', title)
    comment = options.comments
-    if comment != None:
+    if comment is not None:
        comment = comment.encode('ascii', 'replace')
        pat = re.compile(base_pat.replace('name', 'subject'), re.DOTALL)
        if pat.search(src):

@@ -172,7 +185,7 @@ def set_metadata(stream, options):
        else:
            src = add_metadata_item(src, 'subject', comment)
    author = options.authors
-    if author != None:
+    if author is not None:
        author = ', '.join(author)
        author = author.encode('ascii', 'ignore')
        pat = re.compile(base_pat.replace('name', 'author'), re.DOTALL)

@@ -180,14 +193,23 @@ def set_metadata(stream, options):
            src = pat.sub(r'{\\author ' + author + r'}', src)
        else:
            src = add_metadata_item(src, 'author', author)
-    category = options.get('category', None)
-    if category != None:
-        category = category.encode('ascii', 'replace')
+    tags = options.tags
+    if tags is not None:
+        tags = ', '.join(tags)
+        tags = tags.encode('ascii', 'replace')
        pat = re.compile(base_pat.replace('name', 'category'), re.DOTALL)
        if pat.search(src):
-            src = pat.sub(r'{\\category ' + category + r'}', src)
+            src = pat.sub(r'{\\category ' + tags + r'}', src)
        else:
-            src = add_metadata_item(src, 'category', category)
+            src = add_metadata_item(src, 'category', tags)
+    publisher = options.publisher
+    if publisher is not None:
+        publisher = publisher.encode('ascii', 'replace')
+        pat = re.compile(base_pat.replace('name', 'manager'), re.DOTALL)
+        if pat.search(src):
+            src = pat.sub(r'{\\manager ' + publisher + r'}', src)
+        else:
+            src = add_metadata_item(src, 'manager', publisher)
    stream.seek(pos + olen)
    after = stream.read()
    stream.seek(pos)
@@ -3,7 +3,6 @@ __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

-import re
from calibre.customize.conversion import InputFormatPlugin

class MOBIInput(InputFormatPlugin):

@@ -39,11 +38,3 @@ class MOBIInput(InputFormatPlugin):
        accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
        return mr.created_opf_path
-
-    def preprocess_html(self, options, html):
-        # search for places where a first or second level heading is immediately followed by another
-        # top level heading. demote the second heading to h3 to prevent splitting between chapter
-        # headings and titles, images, etc
-        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
-        html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
-        return html
|
@ -139,7 +139,7 @@ class BookHeader(object):
|
|||||||
65001: 'utf-8',
|
65001: 'utf-8',
|
||||||
}[self.codepage]
|
}[self.codepage]
|
||||||
except (IndexError, KeyError):
|
except (IndexError, KeyError):
|
||||||
self.codec = 'cp1252' if user_encoding is None else user_encoding
|
self.codec = 'cp1252' if not user_encoding else user_encoding
|
||||||
log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
|
log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
|
||||||
self.codec))
|
self.codec))
|
||||||
if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length \
|
if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length \
|
||||||
@ -541,6 +541,16 @@ class MobiReader(object):
|
|||||||
pass
|
pass
|
||||||
elif tag.tag == 'img':
|
elif tag.tag == 'img':
|
||||||
tag.set('height', height)
|
tag.set('height', height)
|
||||||
|
else:
|
||||||
|
if tag.tag == 'div' and not tag.text and \
|
||||||
|
(not tag.tail or not tag.tail.strip()) and \
|
||||||
|
not len(list(tag.iterdescendants())):
|
||||||
|
# Paragraph spacer
|
||||||
|
# Insert nbsp so that the element is never
|
||||||
|
# discarded by a renderer
|
||||||
|
tag.text = u'\u00a0' # nbsp
|
||||||
|
styles.append('height: %s' %
|
||||||
|
self.ensure_unit(height))
|
||||||
else:
|
else:
|
||||||
styles.append('margin-top: %s' % self.ensure_unit(height))
|
styles.append('margin-top: %s' % self.ensure_unit(height))
|
||||||
if attrib.has_key('width'):
|
if attrib.has_key('width'):
|
||||||
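The else branch added above keeps otherwise-empty spacer divs alive. The trick in isolation (lxml assumed, as in the reader itself):

    from lxml import etree

    div = etree.fromstring('<div></div>')
    # An empty div with no text, tail or descendants would be dropped by
    # many renderers; a no-break space preserves it as vertical space.
    if div.tag == 'div' and not div.text and \
            (not div.tail or not div.tail.strip()) and \
            not len(list(div.iterdescendants())):
        div.text = u'\u00a0'
    print(etree.tostring(div))  # <div>&#160;</div>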
@@ -632,9 +642,18 @@ class MobiReader(object):
             attrib['class'] = cls
 
         for tag in svg_tags:
-            p = tag.getparent()
-            if hasattr(p, 'remove'):
-                p.remove(tag)
+            images = tag.xpath('descendant::img[@src]')
+            parent = tag.getparent()
+
+            if images and hasattr(parent, 'find'):
+                index = parent.index(tag)
+                for img in images:
+                    img.getparent().remove(img)
+                    img.tail = img.text = None
+                    parent.insert(index, img)
+
+            if hasattr(parent, 'remove'):
+                parent.remove(tag)
 
     def create_opf(self, htmlfile, guide=None, root=None):
         mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
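The rewritten loop above hoists raster <img> children out of an <svg> wrapper before removing it, where the old code threw the whole subtree away. The same move on a toy tree:

    from lxml import etree

    root = etree.fromstring('<p><svg><img src="cover.jpg"/></svg></p>')
    for tag in root.xpath('//svg'):
        images = tag.xpath('descendant::img[@src]')
        parent = tag.getparent()
        if images and hasattr(parent, 'find'):
            index = parent.index(tag)
            for img in images:
                img.getparent().remove(img)
                img.tail = img.text = None
                parent.insert(index, img)
        if hasattr(parent, 'remove'):
            parent.remove(tag)
    print(etree.tostring(root))  # <p><img src="cover.jpg"/></p>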
@@ -251,7 +251,7 @@ class Serializer(object):
         tag = prefixname(elem.tag, nsrmap)
         # Previous layers take care of @name
         id = elem.attrib.pop('id', None)
-        if id is not None:
+        if id:
             href = '#'.join((item.href, id))
             offset = self.anchor_offset or buffer.tell()
             self.id_offsets[urlnormalize(href)] = offset
@@ -1541,7 +1541,10 @@ class MobiWriter(object):
                 exth.write(data)
                 nrecs += 1
             if term == 'rights' :
+                try:
                     rights = unicode(oeb.metadata.rights[0]).encode('utf-8')
+                except:
+                    rights = 'Unknown'
                 exth.write(pack('>II', EXTH_CODES['rights'], len(rights) + 8))
                 exth.write(rights)
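Each EXTH record written above is a big-endian (type, length) pair followed by the payload, with the length counting its own 8 header bytes. An illustration of the rights record layout (the numeric record type is an assumption from the MOBI EXTH convention, not taken from this diff):

    from struct import pack

    EXTH_RIGHTS = 109  # assumed EXTH record type for 'rights'
    rights = 'Copyright 2011 Example Press'
    record = pack('>II', EXTH_RIGHTS, len(rights) + 8) + rights
    # 4 bytes type + 4 bytes total length (header included) + payload
    assert len(record) == len(rights) + 8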
@@ -1892,7 +1892,7 @@ class OEBBook(object):
                 return fix_data(data.decode(bom_enc))
             except UnicodeDecodeError:
                 pass
-        if self.input_encoding is not None:
+        if self.input_encoding:
             try:
                 return fix_data(data.decode(self.input_encoding, 'replace'))
             except UnicodeDecodeError:
@@ -199,8 +199,8 @@ class EbookIterator(object):
                 not hasattr(self.pathtoopf, 'manifest'):
             if hasattr(self.pathtoopf, 'manifest'):
                 self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
-            self.pathtoopf = create_oebbook(self.log, self.pathtoopf, plumber.opts,
-                    plumber.input_plugin)
+            self.pathtoopf = create_oebbook(self.log, self.pathtoopf,
+                    plumber.opts)
 
         if hasattr(self.pathtoopf, 'manifest'):
             self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
@@ -227,7 +227,7 @@ class EbookIterator(object):
                 self.log.warn('Missing spine item:', repr(spath))
 
         cover = self.opf.cover
-        if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf') and cover:
+        if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf', 'fb2') and cover:
             cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
             chtml = (TITLEPAGE%os.path.relpath(cover, self.base).replace(os.sep,
                 '/')).encode('utf-8')
@@ -10,7 +10,7 @@ import os
 from calibre.utils.date import isoformat, now
 from calibre import guess_type
 
-def meta_info_to_oeb_metadata(mi, m, log):
+def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False):
     from calibre.ebooks.oeb.base import OPF
     if not mi.is_null('title'):
         m.clear('title')
@@ -29,15 +29,23 @@ def meta_info_to_oeb_metadata(mi, m, log):
     if not mi.is_null('book_producer'):
         m.filter('contributor', lambda x : x.role.lower() == 'bkp')
         m.add('contributor', mi.book_producer, role='bkp')
+    elif override_input_metadata:
+        m.filter('contributor', lambda x : x.role.lower() == 'bkp')
     if not mi.is_null('comments'):
         m.clear('description')
         m.add('description', mi.comments)
+    elif override_input_metadata:
+        m.clear('description')
     if not mi.is_null('publisher'):
         m.clear('publisher')
         m.add('publisher', mi.publisher)
+    elif override_input_metadata:
+        m.clear('publisher')
     if not mi.is_null('series'):
         m.clear('series')
         m.add('series', mi.series)
+    elif override_input_metadata:
+        m.clear('series')
     if not mi.is_null('isbn'):
         has = False
         for x in m.identifier:
@@ -46,19 +54,27 @@ def meta_info_to_oeb_metadata(mi, m, log):
                 has = True
         if not has:
             m.add('identifier', mi.isbn, scheme='ISBN')
+    elif override_input_metadata:
+        m.filter('identifier', lambda x: x.scheme.lower() == 'isbn')
     if not mi.is_null('language'):
         m.clear('language')
         m.add('language', mi.language)
     if not mi.is_null('series_index'):
         m.clear('series_index')
         m.add('series_index', mi.format_series_index())
+    elif override_input_metadata:
+        m.clear('series_index')
     if not mi.is_null('rating'):
         m.clear('rating')
         m.add('rating', '%.2f'%mi.rating)
+    elif override_input_metadata:
+        m.clear('rating')
     if not mi.is_null('tags'):
         m.clear('subject')
         for t in mi.tags:
             m.add('subject', t)
+    elif override_input_metadata:
+        m.clear('subject')
     if not mi.is_null('pubdate'):
         m.clear('date')
         m.add('date', isoformat(mi.pubdate))
@@ -71,6 +87,7 @@ def meta_info_to_oeb_metadata(mi, m, log):
     if not mi.is_null('publication_type'):
         m.clear('publication_type')
         m.add('publication_type', mi.publication_type)
+
     if not m.timestamp:
         m.add('timestamp', isoformat(now()))
 
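The pattern repeated through the hunks above: when a calibre field is set, it replaces the input's value; when it is empty and override_input_metadata is true, it now erases the input's value instead of silently keeping it. A self-contained mock of that behaviour (not calibre's metadata classes):

    class Meta(object):
        def __init__(self):
            self.items = {'publisher': ['Input Press']}
        def clear(self, name):
            self.items.pop(name, None)
        def add(self, name, val):
            self.items.setdefault(name, []).append(val)

    m = Meta()
    user_publisher = None          # field cleared in the calibre GUI
    override_input_metadata = True

    if user_publisher:             # plays the role of: not mi.is_null('publisher')
        m.clear('publisher')
        m.add('publisher', user_publisher)
    elif override_input_metadata:
        m.clear('publisher')       # an empty GUI field now wipes the stale input value

    print(m.items)  # {}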
@@ -78,11 +95,12 @@ def meta_info_to_oeb_metadata(mi, m, log):
 class MergeMetadata(object):
     'Merge in user metadata, including cover'
 
-    def __call__(self, oeb, mi, opts):
+    def __call__(self, oeb, mi, opts, override_input_metadata=False):
         self.oeb, self.log = oeb, oeb.log
         m = self.oeb.metadata
         self.log('Merging user specified metadata...')
-        meta_info_to_oeb_metadata(mi, m, oeb.log)
+        meta_info_to_oeb_metadata(mi, m, oeb.log,
+                override_input_metadata=override_input_metadata)
         cover_id = self.set_cover(mi, opts.prefer_metadata_cover)
         m.clear('cover')
         if cover_id is not None:
@@ -9,7 +9,6 @@ import os
 from calibre.customize.conversion import InputFormatPlugin
 from calibre.ebooks.pdb.header import PdbHeaderReader
 from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
-from calibre.ebooks.conversion.utils import PreProcessor
 
 class PDBInput(InputFormatPlugin):
 
@@ -32,8 +31,3 @@ class PDBInput(InputFormatPlugin):
         opf = reader.extract_content(os.getcwd())
 
         return opf
-
-    def preprocess_html(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)
@@ -65,9 +65,9 @@ class Reader(FormatReader):
         from calibre.customize.ui import plugin_for_input_format
 
         txt_plugin = plugin_for_input_format('txt')
-        for option in txt_plugin.options:
-            if not hasattr(self.options, option.option.name):
-                setattr(self.options, option.name, option.recommended_value)
+        for opt in txt_plugin.options:
+            if not hasattr(self.options, opt.option.name):
+                setattr(self.options, opt.option.name, opt.recommended_value)
 
         stream.seek(0)
         return txt_plugin.convert(stream, self.options, 'txt', self.log, {})
@@ -31,9 +31,9 @@ class Reader(FormatReader):
         from calibre.customize.ui import plugin_for_input_format
 
         pdf_plugin = plugin_for_input_format('pdf')
-        for option in pdf_plugin.options:
-            if not hasattr(self.options, option.option.name):
-                setattr(self.options, option.name, option.recommended_value)
+        for opt in pdf_plugin.options:
+            if not hasattr(self.options, opt.option.name):
+                setattr(self.options, opt.option.name, opt.recommended_value)
 
         pdf.seek(0)
         return pdf_plugin.convert(pdf, self.options, 'pdf', self.log, {})
@@ -83,9 +83,9 @@ class Reader(FormatReader):
         from calibre.customize.ui import plugin_for_input_format
 
         txt_plugin = plugin_for_input_format('txt')
-        for option in txt_plugin.options:
-            if not hasattr(self.options, option.option.name):
-                setattr(self.options, option.name, option.recommended_value)
+        for opt in txt_plugin.options:
+            if not hasattr(self.options, opt.option.name):
+                setattr(self.options, opt.option.name, opt.recommended_value)
 
         stream.seek(0)
         return txt_plugin.convert(stream, self.options, 'txt', self.log, {})
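The three identical fixes above close one bug: the old loops tested opt.option.name but then set opt.name, so recommended values landed on the wrong attribute. The corrected copy pattern, with minimal mocks in place of calibre's option objects:

    class Opt(object):
        # Mock of a plugin option; calibre nests the real name one level down
        def __init__(self, name, recommended_value):
            self.option = type('O', (), {'name': name})
            self.recommended_value = recommended_value

    class Options(object):
        pass

    options = Options()
    for opt in [Opt('max_line_length', 0)]:   # hypothetical option name
        if not hasattr(options, opt.option.name):
            # Fixed: read and write the *same* attribute name
            setattr(options, opt.option.name, opt.recommended_value)
    print(options.max_line_length)  # 0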
@@ -34,18 +34,15 @@ class PML_HTMLizer(object):
         'ra',
         'c',
         'r',
-        't',
         's',
         'l',
         'k',
-        'T',
         'FN',
         'SB',
     ]
 
     STATES_VALUE_REQ = [
         'a',
-        'T',
         'FN',
         'SB',
     ]
@@ -96,8 +93,6 @@ class PML_HTMLizer(object):
         'Sb': 'sb',
         'c': 'c',
         'r': 'r',
-        't': 't',
-        'T': 'T',
         'i': 'i',
         'I': 'i',
         'u': 'u',
@@ -133,8 +128,6 @@ class PML_HTMLizer(object):
     DIV_STATES = [
         'c',
         'r',
-        't',
-        'T',
         'FN',
         'SB',
     ]
@@ -255,8 +248,6 @@ class PML_HTMLizer(object):
 
         for key, val in self.state.items():
             if val[0]:
-                if key == 'T':
-                    self.state['T'][0] = False
                 if key in self.DIV_STATES:
                     div.append(key)
                 elif key in self.SPAN_STATES:
@@ -506,6 +497,9 @@ class PML_HTMLizer(object):
         self.toc = TOC()
         self.file_name = file_name
 
+        indent_state = {'t': False, 'T': False}
+        adv_indent_val = ''
+
         for s in self.STATES:
             self.state[s] = [False, ''];
 
@@ -515,6 +509,8 @@ class PML_HTMLizer(object):
 
             parsed = []
             empty = True
+            basic_indent = indent_state['t']
+            adv_indent = indent_state['T']
 
             # Must use StringIO, cStringIO does not support unicode
             line = StringIO.StringIO(line)
@@ -527,7 +523,7 @@ class PML_HTMLizer(object):
                 if c == '\\':
                     c = line.read(1)
 
-                    if c in 'qcrtTiIuobBlk':
+                    if c in 'qcriIuobBlk':
                         text = self.process_code(c, line)
                     elif c in 'FS':
                         l = line.read(1)
@@ -574,6 +570,15 @@ class PML_HTMLizer(object):
                     elif c == 'w':
                         empty = False
                         text = '<hr width="%s" />' % self.code_value(line)
+                    elif c == 't':
+                        indent_state[c] = not indent_state[c]
+                        if indent_state[c]:
+                            basic_indent = True
+                    elif c == 'T':
+                        indent_state[c] = not indent_state[c]
+                        if indent_state[c]:
+                            adv_indent = True
+                            adv_indent_val = self.code_value(line)
                     elif c == '-':
                         empty = False
                         text = '&shy;'
@@ -590,6 +595,16 @@ class PML_HTMLizer(object):
             if not empty:
                 text = self.end_line()
                 parsed.append(text)
+
+                if basic_indent:
+                    parsed.insert(0, self.STATES_TAGS['t'][0])
+                    parsed.append(self.STATES_TAGS['t'][1])
+                elif adv_indent:
+                    parsed.insert(0, self.STATES_TAGS['T'][0] % adv_indent_val)
+                    parsed.append(self.STATES_TAGS['T'][1])
+                    indent_state['T'] = False
+                    adv_indent_val = ''
+
                 output.append(u''.join(parsed))
             line.close()
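With the hunks above, \t and \T leave the generic state machine: each code toggles a per-line flag, and the finished line is wrapped in the matching open/close markup when it is flushed. A reduced sketch of the flush step (the STATES_TAGS markup below is assumed, not quoted from the class):

    STATES_TAGS = {
        't': ('<div class="indent">', '</div>'),           # assumed markup
        'T': ('<div style="margin-left: %s">', '</div>'),  # assumed markup
    }

    parsed = ['Some indented paragraph text']
    basic_indent, adv_indent, adv_indent_val = False, True, '10%'

    if basic_indent:
        parsed.insert(0, STATES_TAGS['t'][0])
        parsed.append(STATES_TAGS['t'][1])
    elif adv_indent:
        # \Txx% carries an explicit indent value, consumed at flush time
        parsed.insert(0, STATES_TAGS['T'][0] % adv_indent_val)
        parsed.append(STATES_TAGS['T'][1])

    print(u''.join(parsed))
    # <div style="margin-left: 10%">Some indented paragraph text</div>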
@@ -7,7 +7,6 @@ import os, glob, re, textwrap
 from lxml import etree
 
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
 
 border_style_map = {
         'single' : 'solid',
@@ -77,7 +76,15 @@ class RTFInput(InputFormatPlugin):
 
     def generate_xml(self, stream):
        from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
-        ofile = 'out.xml'
+        ofile = 'dataxml.xml'
+        run_lev, debug_dir = 1, None
+        if getattr(self.opts, 'debug_pipeline', None) is not None:
+            try:
+                os.mkdir(debug_dir)
+                debug_dir = 'rtfdebug'
+                run_lev = 4
+            except:
+                pass
         parser = ParseRtf(
             in_file = stream,
             out_file = ofile,
@@ -115,43 +122,45 @@ class RTFInput(InputFormatPlugin):
 
             # Write or do not write paragraphs. Default is 0.
             empty_paragraphs = 1,
+
+            #debug
+            deb_dir = debug_dir,
+            run_level = run_lev,
         )
         parser.parse_rtf()
-        ans = open('out.xml').read()
-        os.remove('out.xml')
-        return ans
+        with open(ofile, 'rb') as f:
+            return f.read()
 
     def extract_images(self, picts):
+        import imghdr
         self.log('Extracting images...')
+
+        with open(picts, 'rb') as f:
+            raw = f.read()
+        picts = filter(len, re.findall(r'\{\\pict([^}]+)\}', raw))
+        hex = re.compile(r'[^a-fA-F0-9]')
+        encs = [hex.sub('', pict) for pict in picts]
+
         count = 0
-        raw = open(picts, 'rb').read()
-        starts = []
-        for match in re.finditer(r'\{\\pict([^}]+)\}', raw):
-            starts.append(match.start(1))
-
         imap = {}
-        for start in starts:
-            pos, bc = start, 1
-            while bc > 0:
-                if raw[pos] == '}': bc -= 1
-                elif raw[pos] == '{': bc += 1
-                pos += 1
-            pict = raw[start:pos+1]
-            enc = re.sub(r'[^a-zA-Z0-9]', '', pict)
+        for enc in encs:
             if len(enc) % 2 == 1:
                 enc = enc[:-1]
             data = enc.decode('hex')
+            fmt = imghdr.what(None, data)
+            if fmt is None:
+                fmt = 'wmf'
             count += 1
-            name = (('%4d'%count).replace(' ', '0'))+'.wmf'
-            open(name, 'wb').write(data)
+            name = '%04d.%s' % (count, fmt)
+            with open(name, 'wb') as f:
+                f.write(data)
             imap[count] = name
             #open(name+'.hex', 'wb').write(enc)
         return self.convert_images(imap)
 
     def convert_images(self, imap):
-        for count, val in imap.items():
+        self.default_img = None
+        for count, val in imap.iteritems():
             try:
                 imap[count] = self.convert_image(val)
             except:
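The rewritten extractor grabs every {\pict ...} payload with a single findall, strips non-hex characters, and sniffs the real image type rather than assuming WMF. The decode step in isolation (Python 2 idioms, matching the diff):

    import imghdr

    # Hex payload as it might sit inside an RTF {\pict ...} group;
    # this one starts with a PNG signature so imghdr can identify it:
    enc = '89504e470d0a1a0a0000000d49484452'
    if len(enc) % 2 == 1:
        enc = enc[:-1]          # drop a stray trailing nibble
    data = enc.decode('hex')
    fmt = imghdr.what(None, data)
    if fmt is None:
        fmt = 'wmf'             # unidentified pictures default to WMF
    print(fmt)  # png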
@@ -159,11 +168,34 @@ class RTFInput(InputFormatPlugin):
         return imap
 
     def convert_image(self, name):
-        from calibre.utils.magick import Image
-        img = Image()
-        img.open(name)
+        if not name.endswith('.wmf'):
+            return name
+        try:
+            return self.rasterize_wmf(name)
+        except:
+            self.log.exception('Failed to convert WMF image %r'%name)
+            return self.replace_wmf(name)
+
+    def replace_wmf(self, name):
+        from calibre.ebooks import calibre_cover
+        if self.default_img is None:
+            self.default_img = calibre_cover('Conversion of WMF images is not supported',
+                    'Use Microsoft Word or OpenOffice to save this RTF file'
+                    ' as HTML and convert that in calibre.', title_size=36,
+                    author_size=20)
         name = name.replace('.wmf', '.jpg')
-        img.save(name)
+        with open(name, 'wb') as f:
+            f.write(self.default_img)
+        return name
+
+    def rasterize_wmf(self, name):
+        from calibre.utils.wmf.parse import wmf_unwrap
+        with open(name, 'rb') as f:
+            data = f.read()
+        data = wmf_unwrap(data)
+        name = name.replace('.wmf', '.png')
+        with open(name, 'wb') as f:
+            f.write(data)
         return name
 
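The new convert_image is a guard-and-fallback: try the real WMF rasterizer, and on any failure substitute a generated placeholder so the conversion never aborts on one bad picture. The shape of it, with stand-in functions rather than calibre's helpers:

    def rasterize(path):
        # stand-in for rasterize_wmf; may raise on malformed input
        raise ValueError('unsupported WMF record')

    def placeholder(path):
        # stand-in for replace_wmf; writes a generic "not supported" image
        return path.replace('.wmf', '.jpg')

    def convert_image(path):
        if not path.endswith('.wmf'):
            return path            # ordinary images pass through untouched
        try:
            return rasterize(path)
        except Exception:
            return placeholder(path)

    print(convert_image('0001.wmf'))  # 0001.jpg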
@@ -192,27 +224,27 @@ class RTFInput(InputFormatPlugin):
         css += '\n'+'\n'.join(font_size_classes)
         css += '\n' +'\n'.join(color_classes)
 
-        for cls, val in border_styles.items():
+        for cls, val in border_styles.iteritems():
             css += '\n\n.%s {\n%s\n}'%(cls, val)
 
         with open('styles.css', 'ab') as f:
             f.write(css)
 
-    def preprocess(self, fname):
-        self.log('\tPreprocessing to convert unicode characters')
-        try:
-            data = open(fname, 'rb').read()
-            from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
-            tokenizer = RtfTokenizer(data)
-            tokens = RtfTokenParser(tokenizer.tokens)
-            data = tokens.toRTF()
-            fname = 'preprocessed.rtf'
-            with open(fname, 'wb') as f:
-                f.write(data)
-        except:
-            self.log.exception(
-                'Failed to preprocess RTF to convert unicode sequences, ignoring...')
-        return fname
+    # def preprocess(self, fname):
+    #     self.log('\tPreprocessing to convert unicode characters')
+    #     try:
+    #         data = open(fname, 'rb').read()
+    #         from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
+    #         tokenizer = RtfTokenizer(data)
+    #         tokens = RtfTokenParser(tokenizer.tokens)
+    #         data = tokens.toRTF()
+    #         fname = 'preprocessed.rtf'
+    #         with open(fname, 'wb') as f:
+    #             f.write(data)
+    #     except:
+    #         self.log.exception(
+    #             'Failed to preprocess RTF to convert unicode sequences, ignoring...')
+    #     return fname
 
     def convert_borders(self, doc):
         border_styles = []
@@ -249,17 +281,13 @@ class RTFInput(InputFormatPlugin):
         self.log = log
         self.log('Converting RTF to XML...')
         #Name of the preprocesssed RTF file
-        fname = self.preprocess(stream.name)
+        # fname = self.preprocess(stream.name)
         try:
-            xml = self.generate_xml(fname)
+            xml = self.generate_xml(stream.name)
         except RtfInvalidCodeException, e:
             raise ValueError(_('This RTF file has a feature calibre does not '
             'support. Convert it to HTML first and then try it.\n%s')%e)
 
-        '''dataxml = open('dataxml.xml', 'w')
-        dataxml.write(xml)
-        dataxml.close'''
-
         d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
         if d:
             imap = {}
@@ -290,13 +318,9 @@ class RTFInput(InputFormatPlugin):
         res = transform.tostring(result)
         res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
         # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
-        if not getattr(self.opts, 'remove_paragraph_spacing', False):
         res = re.sub('\s*<body>', '<body>', res)
         res = re.sub('(?<=\n)\n{2}',
                 u'<p>\u00a0</p>\n'.encode('utf-8'), res)
-        if self.opts.preprocess_html:
-            preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None))
-            res = preprocessor(res)
         f.write(res)
         self.write_inline_css(inline_class, border_styles)
         stream.seek(0)
@@ -262,7 +262,7 @@ class RTFMLizer(object):
 
         if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '':
             if 'block' in tag_stack:
-                text += '%s ' % txt2rtf(elem.tail)
+                text += '%s' % txt2rtf(elem.tail)
             else:
                 text += '{\\par \\pard \\hyphpar %s}' % txt2rtf(elem.tail)
 
@@ -17,7 +17,8 @@
 #########################################################################
 # $Revision: 1.41 $
 # $Date: 2006/03/24 23:50:07 $
-import sys,os
+import sys, os
+
 from calibre.ebooks.rtf2xml import headings_to_sections, \
     line_endings, footnote, fields_small, default_encoding, \
     make_lists, preamble_div, header, colors, group_borders, \
@@ -90,7 +91,6 @@ class ParseRtf:
         out_file = '',
         out_dir = None,
         dtd = '',
-        #debug = 0, #why? calibre
         deb_dir = None,
         convert_symbol = None,
         convert_wingdings = None,
@@ -107,6 +107,7 @@ class ParseRtf:
         no_dtd = 0,
         char_data = '',
         ):
+
         """
         Requires:
         'file' --file to parse
@@ -119,12 +120,11 @@ class ParseRtf:
         script tries to output to directory where is script is exectued.)
         'deb_dir' --debug directory. If a debug_dir is provided, the script
         will copy each run through as a file to examine in the debug_dir
-        'perl_script'--use perl to make tokens. This runs just a bit faster.
-        (I will probably phase this out.)
         'check_brackets' -- make sure the brackets match up after each run
         through a file. Only for debugging.
         Returns: Nothing
         """
+
         self.__file = in_file
         self.__out_file = out_file
         self.__out_dir = out_dir
@@ -132,7 +132,7 @@ class ParseRtf:
         self.__dtd_path = dtd
         self.__check_file(in_file,"file_to_parse")
         self.__char_data = char_data
-        self.__debug_dir = deb_dir #self.__debug_dir = debug calibre
+        self.__debug_dir = deb_dir
         self.__check_dir(self.__temp_dir)
         self.__copy = self.__check_dir(self.__debug_dir)
         self.__convert_caps = convert_caps
@@ -155,25 +155,24 @@ class ParseRtf:
         if hasattr(the_file, 'read'): return
         if the_file == None:
             if type == "file_to_parse":
-                message = "You must provide a file for the script to work"
-                msg = message
+                msg = "\nYou must provide a file for the script to work"
                 raise RtfInvalidCodeException, msg
         elif os.path.exists(the_file):
             pass # do nothing
         else:
-            message = "The file '%s' cannot be found" % the_file
-            msg = message
+            msg = "\nThe file '%s' cannot be found" % the_file
             raise RtfInvalidCodeException, msg
 
     def __check_dir(self, the_dir):
         """Check to see if directory exists"""
         if not the_dir :
             return
         dir_exists = os.path.isdir(the_dir)
         if not dir_exists:
-            message = "%s is not a directory" % the_dir
-            msg = message
+            msg = "\n%s is not a directory" % the_dir
             raise RtfInvalidCodeException, msg
         return 1
 
     def parse_rtf(self):
         """
         Parse the file by calling on other classes.
@@ -194,13 +193,14 @@ class ParseRtf:
         copy_obj.set_dir(self.__debug_dir)
         copy_obj.remove_files()
         copy_obj.copy_file(self.__temp_file, "original_file")
-        # new as of 2005-08-02. Do I want this?
+        # Function to check if bracket are well handled
         if self.__debug_dir or self.__run_level > 2:
             self.__check_brack_obj = check_brackets.CheckBrackets\
                 (file = self.__temp_file,
                 bug_handler = RtfInvalidCodeException,
                 )
-        # convert Macintosh line endings to Unix line endings
+        #convert Macintosh and Windows line endings to Unix line endings
+        #why do this if you don't wb after?
         line_obj = line_endings.FixLineEndings(
             in_file = self.__temp_file,
             bug_handler = RtfInvalidCodeException,
@@ -208,13 +208,13 @@ class ParseRtf:
             run_level = self.__run_level,
             replace_illegals = self.__replace_illegals,
             )
-        return_value = line_obj.fix_endings()
+        return_value = line_obj.fix_endings() #calibre return what?
         self.__return_code(return_value)
         tokenize_obj = tokenize.Tokenize(
             bug_handler = RtfInvalidCodeException,
             in_file = self.__temp_file,
             copy = self.__copy,
-            run_level = self.__run_level,)
+            run_level = self.__run_level)
         tokenize_obj.tokenize()
         process_tokens_obj = process_tokens.ProcessTokens(
             in_file = self.__temp_file,
@@ -226,15 +226,27 @@ class ParseRtf:
         try:
             return_value = process_tokens_obj.process_tokens()
         except InvalidRtfException, msg:
+            #Check to see if the file is correctly encoded
+            encode_obj = default_encoding.DefaultEncoding(
+                in_file = self.__temp_file,
+                run_level = self.__run_level,
+                bug_handler = RtfInvalidCodeException,
+                check_raw = True,
+                )
+            platform, code_page, default_font_num = encode_obj.find_default_encoding()
+            check_encoding_obj = check_encoding.CheckEncoding(
+                bug_handler = RtfInvalidCodeException,
+                )
+            enc = 'cp' + encode_obj.get_codepage()
+            msg = 'Exception in token processing'
+            if check_encoding_obj.check_encoding(self.__file, enc):
+                file_name = self.__file if isinstance(self.__file, str) \
+                    else self.__file.encode('utf-8')
+                msg = 'File %s does not appear to be correctly encoded.\n' % file_name
             try:
                 os.remove(self.__temp_file)
             except OSError:
                 pass
-            check_encoding_obj = check_encoding.CheckEncoding(
-                bug_handler = RtfInvalidCodeException,
-                )
-            check_encoding_obj.check_encoding(self.__file)
-            sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
             raise InvalidRtfException, msg
         delete_info_obj = delete_info.DeleteInfo(
             in_file = self.__temp_file,
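The new failure path above guesses the codepage from the RTF header and only blames encoding when the raw bytes genuinely fail to decode under it. The essence of that test, written independently of the rtf2xml classes:

    def decodes_as(path, encoding):
        # True when every line of the file decodes under the given codec
        with open(path, 'rb') as f:
            for line in f:
                try:
                    line.decode(encoding)
                except UnicodeDecodeError:
                    return False
        return True

    # e.g. codepage 1252 read from the RTF header:
    # if not decodes_as('book.rtf', 'cp1252'):
    #     raise ValueError('book.rtf does not appear to be correctly encoded.')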
@@ -508,6 +520,7 @@ class ParseRtf:
             indent = self.__indent,
             run_level = self.__run_level,
             no_dtd = self.__no_dtd,
+            encoding = encode_obj.get_codepage(),
             bug_handler = RtfInvalidCodeException,
             )
         tags_obj.convert_to_tags()
@@ -520,35 +533,28 @@ class ParseRtf:
         output_obj.output()
         os.remove(self.__temp_file)
         return self.__exit_level
 
     def __bracket_match(self, file_name):
         if self.__run_level > 2:
             good_br, msg = self.__check_brack_obj.check_brackets()
             if good_br:
                 pass
-                # sys.stderr.write( msg + ' in ' + file_name + "\n")
+                #sys.stderr.write( msg + ' in ' + file_name + "\n")
             else:
-                msg += msg + " in file '" + file_name + "'\n"
+                msg = '%s in file %s\n' % (msg, file_name)
                 raise RtfInvalidCodeException, msg
 
     def __return_code(self, num):
         if num == None:
             return
         if int(num) > self.__exit_level:
             self.__exit_level = num
 
     def __make_temp_file(self,file):
         """Make a temporary file to parse"""
         write_file="rtf_write_file"
         read_obj = file if hasattr(file, 'read') else open(file,'r')
-        write_obj = open(write_file, 'w')
-        line = "dummy"
-        while line:
-            line = read_obj.read(1000)
-            write_obj.write(line )
-        write_obj.close()
+        with open(write_file, 'wb') as write_obj:
+            for line in read_obj:
+                write_obj.write(line)
         return write_file
-"""
-mi<tg<open______<style-sheet\n
-mi<tg<close_____<style-sheet\n
-mi<tg<open-att__<footnote<num>1\n
-mi<tg<empty-att_<page-definition<margin>33\n
-mi<tg<empty_____<para\n
-"""
@@ -24,38 +24,38 @@ class CheckBrackets:
         self.__ob_count = 0
         self.__cb_count = 0
         self.__open_bracket_num = []
 
     def open_brack(self, line):
         num = line[-5:-1]
         self.__open_bracket_num.append(num)
         self.__bracket_count += 1
 
     def close_brack(self, line):
         num = line[-5:-1]
-        ##self.__open_bracket_num.append(num)
         try:
             last_num = self.__open_bracket_num.pop()
         except:
-            return 0
+            return False
         if num != last_num:
-            return 0
+            return False
         self.__bracket_count -= 1
-        return 1
+        return True
 
     def check_brackets(self):
-        read_obj = open(self.__file, 'r')
-        line = 'dummy'
         line_count = 0
-        while line:
-            line_count += 1
-            line = read_obj.readline()
-            self.__token_info = line[:16]
-            if self.__token_info == 'ob<nu<open-brack':
-                self.open_brack(line)
-            if self.__token_info == 'cb<nu<clos-brack':
-                right_count = self.close_brack(line)
-                if not right_count:
-                    return (0, "closed bracket doesn't match, line %s" % line_count)
-        read_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            for line in read_obj:
+                line_count += 1
+                self.__token_info = line[:16]
+                if self.__token_info == 'ob<nu<open-brack':
+                    self.open_brack(line)
+                if self.__token_info == 'cb<nu<clos-brack':
+                    if not self.close_brack(line):
+                        return (False, "closed bracket doesn't match, line %s" % line_count)
         if self.__bracket_count != 0:
-            msg = 'At end of file open and closed brackets don\'t match\n'
-            msg = msg + 'total number of brackets is %s' % self.__bracket_count
-            return (0, msg)
-        return (1, "brackets match!")
+            msg = ('At end of file open and closed brackets don\'t match\n' \
+                'total number of brackets is %s') % self.__bracket_count
+            return (False, msg)
+        return (True, "Brackets match!")
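check_brackets pairs every open-bracket token with the close token carrying the same number; the rewrite keeps that logic but returns real booleans and iterates the file directly. The same bookkeeping as a standalone stack check (token format simplified):

    def check_brackets(tokens):
        stack = []
        for line_count, (kind, num) in enumerate(tokens, 1):
            if kind == 'open':
                stack.append(num)
            elif kind == 'close':
                if not stack or stack.pop() != num:
                    return (False, "closed bracket doesn't match, line %s" % line_count)
        if stack:
            return (False, 'total number of unmatched brackets is %s' % len(stack))
        return (True, 'Brackets match!')

    print(check_brackets([('open', '0001'), ('open', '0002'),
                          ('close', '0002'), ('close', '0001')]))
    # (True, 'Brackets match!')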
@@ -1,8 +1,11 @@
 #!/usr/bin/env python
 import sys
 
 class CheckEncoding:
+
     def __init__(self, bug_handler):
         self.__bug_handler = bug_handler
+
     def __get_position_error(self, line, encoding, line_num):
         char_position = 0
         for char in line:
@@ -12,21 +15,23 @@ class CheckEncoding:
             except UnicodeError, msg:
                 sys.stderr.write('line: %s char: %s\n' % (line_num, char_position))
                 sys.stderr.write(str(msg) + '\n')
-    def check_encoding(self, path, encoding='us-ascii'):
-        read_obj = open(path, 'r')
-        line_to_read = 1
+
+    def check_encoding(self, path, encoding='us-ascii', verbose=True):
         line_num = 0
-        while line_to_read:
-            line_num += 1
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            try:
-                line.decode(encoding)
-            except UnicodeError:
-                if len(line) < 1000:
-                    self.__get_position_error(line, encoding, line_num)
-                else:
-                    sys.stderr.write('line: %d has bad encoding\n'%line_num)
+        with open(path, 'r') as read_obj:
+            for line in read_obj:
+                line_num += 1
+                try:
+                    line.decode(encoding)
+                except UnicodeError:
+                    if verbose:
+                        if len(line) < 1000:
+                            self.__get_position_error(line, encoding, line_num)
+                        else:
+                            sys.stderr.write('line: %d has bad encoding\n' % line_num)
+                    return True
+        return False
 
 if __name__ == '__main__':
     check_encoding_obj = CheckEncoding()
     check_encoding_obj.check_encoding(sys.argv[1])
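After this rewrite check_encoding acts as a predicate: it returns True as soon as a line fails to decode (printing diagnostics only when verbose) and False when the whole file decodes cleanly. Likely usage, following the __main__ block above:

    checker = CheckEncoding(bug_handler=None)  # bug_handler is stored but not used here
    if checker.check_encoding('book.txt', 'cp1252', verbose=False):
        print('book.txt is not valid cp1252')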