Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
[Sync] Sync with trunk. Revision 7760
This commit is contained in: commit 5c52c0f6bb
324
Changelog.yaml
@ -4,6 +4,330 @@
|
||||
# for important features/bug fixes.
|
||||
# Also, each release can have new and improved recipes.
|
||||
|
||||
- version: 0.7.42
|
||||
date: 2011-01-21
|
||||
|
||||
new features:
|
||||
- title: "0.7.42 is a re-release of 0.7.41, because conversion to MOBI was broken in 0.7.41"
|
||||
|
||||
- title: "Conversions: Replace the remove header/footer options with a more geenric search replace option, that allows you to not only remove but also replace text"
|
||||
|
||||
- title: "Conversion: The preprocess html option has now become a new 'Heuristic Processing' option which allows you to control exactly which heuristics are used"
|
||||
|
||||
- title: "Conversion: Various improvements to Heuristic Processing (used to be preprocess HTML)"
|
||||
|
||||
- title: "When adding empty books to calibre, optionally set the author to the author of the currently selected book"
|
||||
tickets: [7702]
|
||||
|
||||
- title: "Device drivers for the Archos 101, SmatQ T7 and Acer Lumiread"
|
||||
|
||||
- title: "Catalog generation: Make By Authors optional"
|
||||
|
||||
- title: "Allow bulk editing of Date and Published columns."
|
||||
|
||||
- title: "Add a little button to clear date and published values to the edit metadata dialogs"
|
||||
|
||||
- title: "When adding books by ISBN, allow the specification of special tags that will be added to the new book entries"
|
||||
tickets: [8436]
|
||||
|
||||
- title: "Completion on multiple authors"
|
||||
tickets: [8405]
|
||||
|
||||
- title: "Add AZW to default list of internally viewed formats, a I am tired of getting tickets about it"
|
||||
|
||||
- title: "Nicer error message when catalog generation fails"
|
||||
|
||||
- title: "Add capitalize option to context menus in the edit metadata dialog"
|
||||
|
||||
bug fixes:
|
||||
- title: "RTF Input: Fix regression in 0.7.40 that broke conversion of some old style RTF files"
|
||||
|
||||
- title: "Fix Tag editor forgets position"
|
||||
tickets: [8271]
|
||||
|
||||
- title: "When converting books in the calibre GUI, override metadata from the input document, even when empty."
|
||||
description: >
|
||||
"So if you have removed all the tags and comments in the calibre GUI for the book in the calibre GUI, but the actual file that is being converted still has tags and comments, they are ignored. This affects only conversions in the calibre GUI, not from the command line via ebook-convert."
|
||||
tickets: [8390]
|
||||
|
||||
- title: "Fix memory leak when switching libraries"
|
||||
|
||||
- title: "RTF Output: Fix incorrent spacing between letters."
|
||||
tickets: [8422]
|
||||
|
||||
- title: "Catalog generation: Add composite columns to Merge Comments eligible types"
|
||||
|
||||
- title: "Add a confirmation when closing the add a custom news source dialog."
|
||||
tickets: [8460]
|
||||
|
||||
- title: "Another workaround for LibraryThing UA sniffing that was preventing series metadata download, sigh."
|
||||
tickets: [8477]
|
||||
|
||||
- title: "PD Novel driver: Put books on the SD card into the eBooks folder"
|
||||
|
||||
- title: "When shortening filepaths to conform to windows path length limitations, remove text from the middle of each component instead of the ends."
|
||||
tickets: [8451]
|
||||
|
||||
- title: "Make completion in most places case insensitive"
|
||||
tickets: [8441]
|
||||
|
||||
- title: "Fix regression that caused the N key to stop working when editing a Yes/no column"
|
||||
tickets: [8417]
|
||||
|
||||
- title: "Email: Fix bug when connecting to SMTP relays that use MD5 auth"
|
||||
|
||||
- title: "MOBI Output: Fix bug that could cause a link pointing to the start of a section to go to a point later in the section is the section contained an empty id attribute"
|
||||
|
||||
- title: "When auto converting books and the device is unplugged, do not raise an error."
|
||||
tickets: [8426]
|
||||
|
||||
- title: "Ebook-viewer: Display cover when viewing FB2 files"
|
||||
|
||||
- title: "MOBI Input: Special case handling of emptu div tags with a defined height used as paragraph separators."
|
||||
tickets: [8391]
|
||||
|
||||
- title: "Fix sorting of author names into sub categories by first letter in the Tag Browser when the first letter has diacritics"
|
||||
tickets: [8378]
|
||||
|
||||
- title: "Fix regression in 0.7.40 that caused commas in author names to become | when converting/saving to disk"
|
||||
|
||||
- title: "Fix view specific format on a book with no formats gives an error"
|
||||
tickets: [8352]
|
||||
|
||||
|
||||
improved recipes:
|
||||
- Blic
|
||||
- Las Vegas Review Journal
|
||||
- La Vanguardia
|
||||
- New York Times
|
||||
- El Pais
|
||||
- Seattle Times
|
||||
- Ars Technica
|
||||
- Dilbert
|
||||
- Nature News
|
||||
|
||||
new recipes:
|
||||
- title: "kath.net"
|
||||
author: "Bobus"
|
||||
|
||||
- title: "iHNed"
|
||||
author: "Karel Bilek"
|
||||
|
||||
- title: "Gulf News"
|
||||
author: "Darko Miletic"
|
||||
|
||||
- title: "South Africa Mail and Guardian"
|
||||
author: "77ja65"
|
||||
|
||||
|
||||
- version: 0.7.40
|
||||
date: 2011-01-14
|
||||
|
||||
new features:
|
||||
- title: "A new 'highlight matches' search mode"
|
||||
description: >
|
||||
"There is now a checkbox next to the search bar named 'Highlight'. If you check it, searching will highlight
|
||||
all matched books instead of filtering the book list to all matched books."
|
||||
|
||||
- title: "RTF Input: Improved support for conversion of images. The bug where some images were shrunk should no longer happen"
|
||||
|
||||
- title: "Template language: Allow you to create your own formatting functions. Accessible via Preferences->Advanced->Template functions"
|
||||
|
||||
- title: "News download: Convert various HTML 5 tags into <div> to support readers that cannot handle HTML 5 tags"
|
||||
|
||||
- title: "RTF metadata: Add support for publisher and tags."
|
||||
tickets: [6657]
|
||||
|
||||
- title: "BibTeX catalog: Add support for custom columns"
|
||||
|
||||
- title: "TXT Input: Support for textile markup"
|
||||
|
||||
- title: "Various minor tweaks to improve usability of Preferences->Plugins"
|
||||
|
||||
- title: "TXT Output: Convert <hr> to scene break marker."
|
||||
|
||||
- title: "Support for the Archos 70"
|
||||
|
||||
- title: "SONY Driver: Add an option to automatically refresh the covers on every connect. Accessible via: Preferences->Plugins->Device interface plugins"
|
||||
|
||||
- title: "Add access to the larger template editor from plugboards via context menu."
|
||||
|
||||
- title: "Speed improvement when connecting a large library to a device"
|
||||
|
||||
- title: "Speedup when searching on multiple words in a large library"
|
||||
|
||||
- title: "TXT Input: Add a heauristic formatting processor"
|
||||
|
||||
|
||||
bug fixes:
|
||||
- title: "Fix bug that caused automatic news removal to remove any book that has a tag that contains the word 'news' instead of only books that have the tag News"
|
||||
|
||||
- title: "Refactor the downloading social metadata message box to allow canceling."
|
||||
tickets: [8234]
|
||||
|
||||
- title: "Kobo drive does not deal with Null value in DateCreated column"
|
||||
tickets: [8308]
|
||||
|
||||
- title: "MOBI Input: Fix regression that caused images placed inside svg tags to be discarded"
|
||||
|
||||
- title: "Fix selecting Tablet output profile would actually select the Samsung Galaxy S profile"
|
||||
|
||||
- title: "Catalog generation: Fix a condition that could cause TOCs to not be properly generated in MOBI format catalogs"
|
||||
tickets: [8295]
|
||||
|
||||
- title: "Zip file reading: Be more tolerant when a zip file has a damaged file directory"
|
||||
|
||||
- title: "RTF Input: Various code cleanups. Go back to trying to handle unicode mappings without pre-processing. This will mean that some RTF files that used to convert, won't anymore. Please open tickets and attach them."
|
||||
tickets: [8171]
|
||||
|
||||
- title: "ImageMagick: When identifying an image don't read the entire image"
|
||||
|
||||
- title: "FB2 Output: Add cover to FB2 metadata."
|
||||
|
||||
- title: "Fix inability to customize builting recipe when more than one recipe has the same name"
|
||||
tickets: [8281]
|
||||
|
||||
- title: "RTF Input: Fix regression that broke the Preprocess HTML option"
|
||||
|
||||
- title: "Fix XSS vulnerability in content server."
|
||||
tickets: [7980]
|
||||
|
||||
- title: "TXT Output: Clean up and produce consistant output. Spacing around headings. Headings are not indented when using the remove paragraph spacing option."
|
||||
|
||||
- title: "Catalog generation: Handle invalid covers gracefully"
|
||||
|
||||
- title: "Email settings: Before displaying the email test dialog warn the user that it will expose their email password"
|
||||
|
||||
- title: "PDB Output: Fix regression that caused some PDB files to not work with other software"
|
||||
tickets: [8231]
|
||||
|
||||
improved recipes:
|
||||
- Financial Times UK
|
||||
- Globe and Mail
|
||||
- Wired Daily
|
||||
- MIT Technology Review
|
||||
- MSNBC
|
||||
- expansion.com
|
||||
- New York Times
|
||||
- Heraldo de Aragon
|
||||
- Exiled online
|
||||
|
||||
new recipes:
|
||||
- title: "Yakima Herald and Tri-City Herald"
|
||||
author: "Laura Gjovaag"
|
||||
|
||||
- title: "Wichita Eagle"
|
||||
author: "Jason Cameron"
|
||||
|
||||
- title: "Pressthink and Zero Hedge"
|
||||
author: "Darko Miletic"
|
||||
|
||||
- title: "tyzden"
|
||||
author: "zemiak"
|
||||
|
||||
- title: "El Correo"
|
||||
author: "desUBIKado"
|
||||
|
||||
- title: "Cicero"
|
||||
author: "mad"
|
||||
|
||||
- title: "El Publico"
|
||||
author: "Gerardo Diez"
|
||||
|
||||
- version: 0.7.38
|
||||
date: 2011-01-07
|
||||
|
||||
new features:
|
||||
- title: "Reduce startup time when using a composite custom column"
|
||||
|
||||
- title: "Template language: Add a list_item function for use with tags like columns. See User Manual for details"
|
||||
|
||||
- title: "TXT Input: Attempt to detect the input encoding when not specified. Auto detect paragraph structure and formatting markup."
|
||||
|
||||
- title: "Search & replace: Add ability to manipulate number and boolean columns."
|
||||
|
||||
- title: "Add type ahead completion to the advanced search dialog."
|
||||
tickets: [8035]
|
||||
|
||||
- title: "Double click on plugin in Preferences dialog to customize"
|
||||
tickets: [8175]
|
||||
|
||||
- title: "Allow customization of the SONY driver to send thumbnail to the device. Useful with newer SONY readers"
|
||||
tickets: [8161]
|
||||
|
||||
- title: "Smarten punctuation: Convert double dashes to em dashes. Preprocessing: Various tweaks"
|
||||
|
||||
bug fixes:
|
||||
- title: "Fix regression causing the template formatter to intepret a missing format letter as ERROR instead of 's'."
|
||||
|
||||
- title: "Fix regression that broke conversion of PNG images in PDF files on OS X."
|
||||
tickets: [8215]
|
||||
|
||||
- title: "Content server: Fix improper XML escaping of category titles in the OPDS feeds"
|
||||
tickets: [8225]
|
||||
|
||||
- title: "When decoding XML if the XML starts with a UTF-8 BOM decode as UTF-8. Fixes parsing of FB2 files with UTF-8 BOMs"
|
||||
|
||||
- title: "E-book viewer: When scrolling to a bookmark and the content is wider than the window, do not scroll in the horizontal direction"
|
||||
|
||||
- title: "E-book viewer: Fix next page skipping the bottom of chapters when the content is wider than the window."
|
||||
tickets: [8153]
|
||||
|
||||
- title: " FB2 Output: Insert covers."
|
||||
tickets: [8172]
|
||||
|
||||
- title: "Content server: When serving OPDS feeds handle html descriptions that have namespaced attributes."
|
||||
tickets: [7938]
|
||||
|
||||
- title: "When downloading metadata from isbndb.com, download a maximum of 30 results rather than 1000"
|
||||
|
||||
- title: "Fix sorting of tags column"
|
||||
|
||||
- title: "Change search/replace to show commas instead of vertical bars as the separator for multiple authors"
|
||||
|
||||
- title: "Template language: Make all column names case insensitive"
|
||||
|
||||
- title: "Fix bug that prevent the Disabled option for Tag Browser partiotining from working in the Preferences dialog"
|
||||
|
||||
- title: "Fix bug when using tags like custom column in the template language"
|
||||
|
||||
- title: "Fix bug where composite custom columns using general_program_mode fields are not evaluated correctly when used in a template."
|
||||
|
||||
- title: "ImageMagick interface: Don't crash when asked to open empty image files"
|
||||
|
||||
- title: "Kobo driver: Add TXT,CBZ,CBR to supported formats list"
|
||||
tickets: [8124]
|
||||
|
||||
- title: "Don't uneccessarily scroll the book list horizontally when re-selcting previously selected rows."
|
||||
|
||||
new recipes:
|
||||
- title: "New London Day"
|
||||
author: "Being"
|
||||
|
||||
- title: "Walla"
|
||||
author: "marbs"
|
||||
|
||||
- title: "New Journal of Physics"
|
||||
author: "Chema Cortes"
|
||||
|
||||
- title: "The Baltimore Sun"
|
||||
author: "Josh Hall"
|
||||
|
||||
- title: "Arabian Business and Sunday Times (UK)"
|
||||
author: "Darko Miletic"
|
||||
|
||||
- title: "Deia"
|
||||
author: "Gerardo Diez"
|
||||
|
||||
- title: "Smarter Planet"
|
||||
author: "Jack Mason"
|
||||
|
||||
|
||||
improved recipes:
|
||||
- The Atlantic
|
||||
- Danas
|
||||
- Ledevoir
|
||||
|
||||
- version: 0.7.37
|
||||
date: 2011-01-02
|
||||
|
||||
|
@ -1,6 +1,4 @@
|
||||
@echo OFF
|
||||
REM CalibreRun.bat
|
||||
REM ~~~~~~~~~~~~~~
|
||||
REM Batch File to start a Calibre configuration on Windows
|
||||
REM giving explicit control of the location of:
|
||||
REM - Calibre Program Files
|
||||
@ -24,7 +22,10 @@ REM -------------------------------------
|
||||
REM Set up Calibre Config folder
|
||||
REM -------------------------------------
|
||||
|
||||
If EXIST CalibreConfig SET CALIBRE_CONFIG_DIRECTORY=%cd%\CalibreConfig
|
||||
IF EXIST CalibreConfig (
|
||||
SET CALIBRE_CONFIG_DIRECTORY=%cd%\CalibreConfig
|
||||
ECHO CONFIG=%cd%\CalibreConfig
|
||||
)
|
||||
|
||||
|
||||
REM --------------------------------------------------------------
|
||||
@ -38,9 +39,18 @@ REM drive letter of the USB stick.
|
||||
REM Comment out any of the following that are not to be used
|
||||
REM --------------------------------------------------------------
|
||||
|
||||
IF EXIST U:\eBooks\CalibreLibrary (
|
||||
SET CALIBRE_LIBRARY_DIRECTORY=U:\eBOOKS\CalibreLibrary
|
||||
IF EXIST CalibreLibrary SET CALIBRE_LIBRARY_DIRECTORY=%cd%\CalibreLibrary
|
||||
IF EXIST CalibreBooks SET CALIBRE_LIBRARY_DIRECTORY=%cd%\CalibreBooks
|
||||
ECHO LIBRARY=U:\eBOOKS\CalibreLibrary
|
||||
)
|
||||
IF EXIST CalibreLibrary (
|
||||
SET CALIBRE_LIBRARY_DIRECTORY=%cd%\CalibreLibrary
|
||||
ECHO LIBRARY=%cd%\CalibreLibrary
|
||||
)
|
||||
IF EXIST CalibreBooks (
|
||||
SET CALIBRE_LIBRARY_DIRECTORY=%cd%\CalibreBooks
|
||||
ECHO LIBRARY=%cd%\CalibreBooks
|
||||
)
|
||||
|
||||
|
||||
REM --------------------------------------------------------------
|
||||
@ -50,12 +60,32 @@ REM Location where the metadata.db file is located. If not set
|
||||
REM the same location as the Books files will be assumed. This
|
||||
REM option is used to get better performance when the Library is
|
||||
REM on a (slow) network drive. Putting the metadata.db file
|
||||
REM locally gives a big performance improvement.
|
||||
REM locally makes gives a big performance improvement.
|
||||
REM
|
||||
REM NOTE. If you use this option, then the ability to switch
|
||||
REM libraries within Calibre will be disabled. Therefore
|
||||
REM you do not want to set it if the metadata.db file
|
||||
REM is at the same location as the book files.
|
||||
REM --------------------------------------------------------------
|
||||
|
||||
IF EXIST CalibreBooks SET SET CALIBRE_OVERRIDE_DATABASE_PATH=%cd%\CalibreBooks\metadata.db
|
||||
IF EXIST CalibreMetadata SET CALIBRE_OVERRIDE_DATABASE_PATH=%cd%\CalibreMetadata\metadata.db
|
||||
|
||||
IF EXIST CalibreBooks (
|
||||
IF NOT "%CALIBRE_LIBRARY_DIRECTORY%" == "%cd%\CalibreBooks" (
|
||||
SET CALIBRE_OVERRIDE_DATABASE_PATH=%cd%\CalibreBooks\metadata.db
|
||||
ECHO DATABASE=%cd%\CalibreBooks\metadata.db
|
||||
ECHO '
|
||||
ECHO ***CAUTION*** Library Switching will be disabled
|
||||
ECHO '
|
||||
)
|
||||
)
|
||||
IF EXIST CalibreMetadata (
|
||||
IF NOT "%CALIBRE_LIBRARY_DIRECTORY%" == "%cd%\CalibreMetadata" (
|
||||
SET CALIBRE_OVERRIDE_DATABASE_PATH=%cd%\CalibreMetadata\metadata.db
|
||||
ECHO DATABASE=%cd%\CalibreMetadata\metadata.db
|
||||
ECHO '
|
||||
ECHO ***CAUTION*** Library Switching will be disabled
|
||||
ECHO '
|
||||
)
|
||||
)
|
||||
|
||||
REM --------------------------------------------------------------
|
||||
REM Specify Location of source (optional)
|
||||
@ -63,13 +93,20 @@ REM
|
||||
REM It is easy to run Calibre from source
|
||||
REM Just set the environment variable to where the source is located
|
||||
REM When running from source the GUI will have a '*' after the version.
|
||||
REM number that is displayed at the bottom of the Calibre main screen.
|
||||
REM --------------------------------------------------------------
|
||||
|
||||
IF EXIST Calibre\src SET CALIBRE_DEVELOP_FROM=%cd%\Calibre\src
|
||||
|
||||
IF EXIST Calibre\src (
|
||||
SET CALIBRE_DEVELOP_FROM=%cd%\Calibre\src
|
||||
ECHO SOURCE=%cd%\Calibre\src
|
||||
)
|
||||
IF EXIST D:\Calibre\Calibre\src (
|
||||
SET CALIBRE_DEVELOP_FROM=D:\Calibre\Calibre\src
|
||||
ECHO SOURCE=D:\Calibre\Calibre\src
|
||||
)
|
||||
|
||||
REM --------------------------------------------------------------
|
||||
REM Specify Location of calibre binaries (optinal)
|
||||
REM Specify Location of calibre binaries (optional)
|
||||
REM
|
||||
REM To avoid needing Calibre to be set in the search path, ensure
|
||||
REM that Calibre Program Files is current directory when starting.
|
||||
@ -78,21 +115,15 @@ REM This folder can be populated by copying the Calibre2 folder from
|
||||
REM an existing installation or by installing directly to this folder.
|
||||
REM --------------------------------------------------------------
|
||||
|
||||
IF EXIST Calibre2 CD Calibre2
|
||||
|
||||
|
||||
REM --------------------------------------------
|
||||
REM Display settings that will be used
|
||||
REM --------------------------------------------
|
||||
|
||||
echo PROGRAMS=%cd%
|
||||
echo SOURCE=%CALIBRE_DEVELOP_FROM%
|
||||
echo CONFIG=%CALIBRE_CONFIG_DIRECTORY%
|
||||
echo LIBRARY=%CALIBRE_LIBRARY_DIRECTORY%
|
||||
echo DATABASE=%CALIBRE_OVERRIDE_DATABASE_PATH%
|
||||
IF EXIST Calibre2 (
|
||||
CD Calibre2
|
||||
ECHO PROGRAMS=%cd%
|
||||
)
|
||||
|
||||
REM ----------------------------------------------------------
|
||||
REM The following gives a chance to check the settings before
|
||||
REM starting Calibre. It can be commented out if not wanted.
|
||||
REM ----------------------------------------------------------
|
||||
|
||||
echo "Press CTRL-C if you do not want to continue"
|
||||
pause
|
||||
@ -111,4 +142,4 @@ REM Use with /WAIT to wait until Calibre completes to run a task on exit
|
||||
REM --------------------------------------------------------
|
||||
|
||||
echo "Starting up Calibre"
|
||||
START /belownormal Calibre --with-library %CALIBRE_LIBRARY_DIRECTORY%
|
||||
START /belownormal Calibre --with-library "%CALIBRE_LIBRARY_DIRECTORY%"
|
||||
|
42
resources/catalog/section_list_templates.py
Normal file
@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
These templates control the content of titles displayed in the various sections
|
||||
|
||||
Available fields:
|
||||
{title} Title of the book
|
||||
{series} Series name
|
||||
{series_index} Number of the book in the series
|
||||
{rating} Rating
|
||||
{rating_parens} Rating, in parentheses
|
||||
{pubyear} Year the book was published
|
||||
{pubyear_parens} Year the book was published, in parentheses
|
||||
|
||||
'''
|
||||
# Books by Author
|
||||
by_authors_normal_title_template = '{title} {pubyear_parens}'
|
||||
by_authors_series_title_template = '[{series_index}] {title} {pubyear_parens}'
|
||||
|
||||
# Books by Title
|
||||
by_titles_normal_title_template = '{title}'
|
||||
by_titles_series_title_template = '{title} ({series} [{series_index}])'
|
||||
|
||||
# Books by Series
|
||||
by_series_title_template = '[{series_index}] {title} {pubyear_parens}'
|
||||
|
||||
# Books by Genre
|
||||
by_genres_normal_title_template = '{title} {pubyear_parens}'
|
||||
by_genres_series_title_template = '{series_index}. {title} {pubyear_parens}'
|
||||
|
||||
# Recently Added
|
||||
by_recently_added_normal_title_template = '{title}'
|
||||
by_recently_added_series_title_template = '{title} ({series} [{series_index}])'
|
||||
|
||||
# By Month added
|
||||
by_month_added_normal_title_template = '{title} {pubyear_parens}'
|
||||
by_month_added_series_title_template = '[{series_index}] {title} {pubyear_parens}'
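
For illustration only (not part of the commit): a minimal sketch of how fields like these can be substituted with Python's str.format; the sample book values below are assumptions, not data taken from calibre.

# Hypothetical example of filling in one of the templates defined above
sample_book = {
    'title': 'A Study in Scarlet',
    'series': 'Sherlock Holmes',
    'series_index': 1,
    'pubyear': 1887,
    'pubyear_parens': '(1887)',
    'rating': 4,
    'rating_parens': '(4)',
}
print(by_authors_series_title_template.format(**sample_book))
# -> [1] A Study in Scarlet (1887)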
|
BIN
resources/images/document-encrypt.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 8.8 KiB |
BIN
resources/images/heuristics.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 9.3 KiB |
BIN
resources/images/news/exiled.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.3 KiB |
BIN
resources/images/news/pressthink.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 533 B |
BIN
resources/images/news/zerohedge.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 3.0 KiB |
BIN
resources/images/template_funcs.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 16 KiB |
@ -1,6 +1,5 @@
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
arstechnica.com
|
||||
'''
|
||||
@ -9,19 +8,26 @@ import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
|
||||
|
||||
class ArsTechnica2(BasicNewsRecipe):
|
||||
class ArsTechnica(BasicNewsRecipe):
|
||||
title = u'Ars Technica'
|
||||
language = 'en'
|
||||
__author__ = 'Darko Miletic and Sujata Raman'
|
||||
__author__ = 'Darko Miletic, Sujata Raman, Alexis Rohou'
|
||||
description = 'The art of technology'
|
||||
publisher = 'Ars Technica'
|
||||
category = 'news, IT, technology'
|
||||
oldest_article = 2
|
||||
oldest_article = 5
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
encoding = 'utf-8'
|
||||
use_embedded_content = False
|
||||
extra_css = ' body {font-family: Arial,Helvetica,sans-serif} .title{text-align: left} .byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none} '
|
||||
extra_css = '''
|
||||
body {font-family: Arial,Helvetica,sans-serif}
|
||||
.title{text-align: left}
|
||||
.byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none}
|
||||
.news-item-figure-caption-text{font-size:small; font-style:italic}
|
||||
.news-item-figure-caption-byline{font-size:small; font-style:italic; font-weight:bold}
|
||||
'''
|
||||
ignoreEtcArticles = True # Etc feed items can be ignored, as they're not real stories
|
||||
|
||||
conversion_options = {
|
||||
'comments' : description
|
||||
@ -31,10 +37,10 @@ class ArsTechnica2(BasicNewsRecipe):
|
||||
}
|
||||
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<div class="news-item-figure', re.DOTALL|re.IGNORECASE),lambda match: '<div class="news-item-figure"')
|
||||
,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
|
||||
]
|
||||
#preprocess_regexps = [
|
||||
# (re.compile(r'<div class="news-item-figure', re.DOTALL|re.IGNORECASE),lambda match: '<div class="news-item-figure"')
|
||||
# ,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
|
||||
# ]
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'id':['story','etc-story']})]
|
||||
|
||||
@ -42,7 +48,7 @@ class ArsTechnica2(BasicNewsRecipe):
|
||||
dict(name=['object','link','embed'])
|
||||
,dict(name='div', attrs={'class':'read-more-link'})
|
||||
]
|
||||
remove_attributes=['width','height']
|
||||
#remove_attributes=['width','height']
|
||||
|
||||
feeds = [
|
||||
(u'Infinite Loop (Apple content)' , u'http://feeds.arstechnica.com/arstechnica/apple/' )
|
||||
@ -56,6 +62,7 @@ class ArsTechnica2(BasicNewsRecipe):
|
||||
,(u'Law & Disorder (Tech policy content)' , u'http://feeds.arstechnica.com/arstechnica/tech-policy/')
|
||||
]
|
||||
|
||||
# This deals with multi-page stories
|
||||
def append_page(self, soup, appendtag, position):
|
||||
pager = soup.find('div',attrs={'class':'pager'})
|
||||
if pager:
|
||||
@ -81,6 +88,7 @@ class ArsTechnica2(BasicNewsRecipe):
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
# Adds line breaks near the byline (not sure why this is needed)
|
||||
ftag = soup.find('div', attrs={'class':'byline'})
|
||||
if ftag:
|
||||
brtag = Tag(soup,'br')
|
||||
@ -88,12 +96,33 @@ class ArsTechnica2(BasicNewsRecipe):
|
||||
ftag.insert(4,brtag)
|
||||
ftag.insert(5,brtag2)
|
||||
|
||||
# Remove style items
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
|
||||
# Remove id
|
||||
for item in soup.findAll(id=True):
|
||||
del item['id']
|
||||
|
||||
# For some reason, links to authors don't have the domainname
|
||||
a_author = soup.find('a',{'href':re.compile("^/author")})
|
||||
if a_author:
|
||||
a_author['href'] = 'http://arstechnica.com'+a_author['href']
|
||||
|
||||
# within div class news-item-figure, we need to grab images
|
||||
|
||||
# Deal with multi-page stories
|
||||
self.append_page(soup, soup.body, 3)
|
||||
|
||||
return soup
|
||||
|
||||
def get_article_url(self, article):
|
||||
# If the article title starts with Etc:, don't return it
|
||||
if self.ignoreEtcArticles:
|
||||
article_title = article.get('title',None)
|
||||
if re.match('Etc: ',article_title) is not None:
|
||||
return None
|
||||
|
||||
# The actual article is in a guid tag
|
||||
return article.get('guid', None).rpartition('?')[0]
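# Illustrative note (not part of the recipe): for a hypothetical feed entry whose guid is
#   http://arstechnica.com/apple/news/2011/01/some-story.ars?comments=1
# rpartition('?')[0] keeps everything before the last '?', returning
#   http://arstechnica.com/apple/news/2011/01/some-story.ars
# i.e. the query string is stripped from the article URL.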
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
blic.rs
|
||||
'''
|
||||
@ -21,21 +21,53 @@ class Blic(BasicNewsRecipe):
|
||||
masthead_url = 'http://www.blic.rs/resources/images/header/header_back.png'
|
||||
language = 'sr'
|
||||
publication_type = 'newspaper'
|
||||
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: Georgia, serif1, serif} .article_description{font-family: Arial, sans1, sans-serif} .img_full{float: none} img{margin-bottom: 0.8em} '
|
||||
extra_css = """
|
||||
@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
|
||||
@font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
|
||||
body{font-family: Georgia, serif1, serif}
|
||||
.articledescription,#nadnaslov,.article_info{font-family: Arial, sans1, sans-serif}
|
||||
.img_full{float: none}
|
||||
#nadnaslov{font-size: small}
|
||||
#article_lead{font-size: 1.5em}
|
||||
h1{color: red}
|
||||
.potpis{font-size: x-small; color: gray}
|
||||
.article_info{font-size: small}
|
||||
img{margin-bottom: 0.8em; margin-top: 0.8em; display: block}
|
||||
"""
|
||||
|
||||
conversion_options = {
|
||||
'comment' : description
|
||||
, 'tags' : category
|
||||
, 'publisher': publisher
|
||||
, 'language' : language
|
||||
, 'linearize_tables' : True
|
||||
}
|
||||
|
||||
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
|
||||
remove_tags_before = dict(name='div', attrs={'id':'article_info'})
|
||||
remove_tags = [dict(name=['object','link'])]
|
||||
remove_attributes = ['width','height']
|
||||
remove_tags = [dict(name=['object','link','meta','base','object','embed'])]
|
||||
remove_attributes = ['width','height','m_id','m_ext','mlg_id','poll_id','v_id']
|
||||
|
||||
feeds = [(u'Danasnje Vesti', u'http://www.blic.rs/rss/danasnje-vesti')]
|
||||
feeds = [
|
||||
(u'Politika' , u'http://www.blic.rs/rss/Vesti/Politika')
|
||||
,(u'Tema Dana' , u'http://www.blic.rs/rss/Vesti/Tema-Dana')
|
||||
,(u'Svet' , u'http://www.blic.rs/rss/Vesti/Svet')
|
||||
,(u'Drustvo' , u'http://www.blic.rs/rss/Vesti/Drustvo')
|
||||
,(u'Ekonomija' , u'http://www.blic.rs/rss/Vesti/Ekonomija')
|
||||
,(u'Hronika' , u'http://www.blic.rs/rss/Vesti/Hronika')
|
||||
,(u'Beograd' , u'http://www.blic.rs/rss/Vesti/Beograd')
|
||||
,(u'Srbija' , u'http://www.blic.rs/rss/Vesti/Srbija')
|
||||
,(u'Vojvodina' , u'http://www.blic.rs/rss/Vesti/Vojvodina')
|
||||
,(u'Republika Srpska' , u'http://www.blic.rs/rss/Vesti/Republika-Srpska')
|
||||
,(u'Reportaza' , u'http://www.blic.rs/rss/Vesti/Reportaza')
|
||||
,(u'Dodatak' , u'http://www.blic.rs/rss/Vesti/Dodatak')
|
||||
,(u'Zabava' , u'http://www.blic.rs/rss/Zabava')
|
||||
,(u'Kultura' , u'http://www.blic.rs/rss/Kultura')
|
||||
,(u'Slobodno Vreme' , u'http://www.blic.rs/rss/Slobodno-vreme')
|
||||
,(u'IT' , u'http://www.blic.rs/rss/IT')
|
||||
,(u'Komentar' , u'http://www.blic.rs/rss/Komentar')
|
||||
,(u'Intervju' , u'http://www.blic.rs/rss/Intervju')
|
||||
]
|
||||
|
||||
|
||||
def print_version(self, url):
|
||||
@ -44,4 +76,4 @@ class Blic(BasicNewsRecipe):
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
return self.adeify_images(soup)
|
||||
return soup
|
||||
|
35
resources/recipes/cicero.recipe
Normal file
@ -0,0 +1,35 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Cicero(BasicNewsRecipe):
|
||||
timefmt = ' [%Y-%m-%d]'
|
||||
title = u'Cicero'
|
||||
__author__ = 'mad@sharktooth.de'
|
||||
description = u'Magazin f\xfcr politische Kultur'
|
||||
oldest_article = 7
|
||||
language = 'de'
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
publisher = 'Ringier Publishing'
|
||||
category = 'news, politics, Germany'
|
||||
encoding = 'iso-8859-1'
|
||||
publication_type = 'magazine'
|
||||
masthead_url = 'http://www.cicero.de/img2/cicero_logo_rss.gif'
|
||||
feeds = [
|
||||
(u'Das gesamte Portfolio', u'http://www.cicero.de/rss/rss.php?ress_id='),
|
||||
#(u'Alle Heft-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=heft'),
|
||||
#(u'Alle Online-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=online'),
|
||||
#(u'Berliner Republik', u'http://www.cicero.de/rss/rss.php?ress_id=4'),
|
||||
#(u'Weltb\xfchne', u'http://www.cicero.de/rss/rss.php?ress_id=1'),
|
||||
#(u'Salon', u'http://www.cicero.de/rss/rss.php?ress_id=7'),
|
||||
#(u'Kapital', u'http://www.cicero.de/rss/rss.php?ress_id=6'),
|
||||
#(u'Netzst\xfccke', u'http://www.cicero.de/rss/rss.php?ress_id=9'),
|
||||
#(u'Leinwand', u'http://www.cicero.de/rss/rss.php?ress_id=12'),
|
||||
#(u'Bibliothek', u'http://www.cicero.de/rss/rss.php?ress_id=15'),
|
||||
(u'Kolumne - Alle Kolumnen', u'http://www.cicero.de/rss/rss2.php?ress_id='),
|
||||
#(u'Kolumne - Schreiber, Berlin', u'http://www.cicero.de/rss/rss2.php?ress_id=35'),
|
||||
#(u'Kolumne - TV Kritik', u'http://www.cicero.de/rss/rss2.php?ress_id=34')
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
return 'http://www.cicero.de/page_print.php?' + url.rpartition('?')[2]
|
@ -11,7 +11,7 @@ class CNetJapan(BasicNewsRecipe):
|
||||
(u'CNet Blog', u'http://feed.japan.cnet.com/rss/blog/index.rdf')
|
||||
]
|
||||
language = 'ja'
|
||||
encoding = 'Shift_JIS'
|
||||
encoding = 'utf-8'
|
||||
remove_javascript = True
|
||||
|
||||
preprocess_regexps = [
|
||||
|
@ -7,22 +7,29 @@ class DallasNews(BasicNewsRecipe):
|
||||
max_articles_per_feed = 25
|
||||
|
||||
no_stylesheets = True
|
||||
remove_tags_before = dict(name='h2', attrs={'class':'vitstoryheadline'})
|
||||
remove_tags_after = dict(name='div', attrs={'style':'width: 100%; clear: right'})
|
||||
remove_tags_after = dict(name='div', attrs={'id':'article_tools_bottom'})
|
||||
use_embedded_content = False
|
||||
remove_tags_before = dict(name='h1')
|
||||
keep_only_tags = {'class':lambda x: x and 'article' in x}
|
||||
remove_tags = [
|
||||
dict(name='iframe'),
|
||||
dict(name='div', attrs={'class':'biblockmore'}),
|
||||
dict(name='div', attrs={'style':'width: 100%; clear: right'}),
|
||||
dict(name='div', attrs={'id':'article_tools_bottom'}),
|
||||
#dict(name='ul', attrs={'class':'articleTools'}),
|
||||
{'class':['DMNSocialTools', 'article ', 'article first ', 'article premium']},
|
||||
]
|
||||
|
||||
feeds = [
|
||||
('Latest News', 'http://www.dallasnews.com/newskiosk/rss/dallasnewslatestnews.xml'),
|
||||
('Local News', 'http://www.dallasnews.com/newskiosk/rss/dallasnewslocalnews.xml'),
|
||||
('Nation and World', 'http://www.dallasnews.com/newskiosk/rss/dallasnewsnationworld.xml'),
|
||||
('Politics', 'http://www.dallasnews.com/newskiosk/rss/dallasnewsnationalpolitics.xml'),
|
||||
('Science', 'http://www.dallasnews.com/newskiosk/rss/dallasnewsscience.xml'),
|
||||
('Local News',
|
||||
'http://www.dallasnews.com/news/politics/local-politics/?rss'),
|
||||
('National Politics',
|
||||
'http://www.dallasnews.com/news/politics/national-politic/?rss'),
|
||||
('State Politics',
|
||||
'http://www.dallasnews.com/news/politics/state-politics/?rss'),
|
||||
('Religion',
|
||||
'http://www.dallasnews.com/news/religion/?rss'),
|
||||
('Crime',
|
||||
'http://www.dallasnews.com/news/crime/headlines/?rss'),
|
||||
('Celebrity News',
|
||||
'http://www.dallasnews.com/entertainment/celebrity-news/?rss&listname=TopStories'),
|
||||
('Nation',
|
||||
'http://www.dallasnews.com/news/nation-world/nation/?rss'),
|
||||
('World',
|
||||
'http://www.dallasnews.com/news/nation-world/world/?rss'),
|
||||
]
|
||||
|
||||
|
@ -22,7 +22,7 @@ class Deia(BasicNewsRecipe):
|
||||
cover_url ='http://2.bp.blogspot.com/_RjrWzC6tI14/TM6jrPLaBZI/AAAAAAAAFaI/ayffwxidFEY/s1600/2009-10-13-logo-deia.jpg'
|
||||
timefmt ='[%a, %d %b, %Y]'
|
||||
encoding ='utf8'
|
||||
language ='es_ES'
|
||||
language ='es'
|
||||
remove_javascript =True
|
||||
remove_tags_after =dict(id='Texto')
|
||||
remove_tags_before =dict(id='Texto')
|
||||
|
@ -28,7 +28,7 @@ class DilbertBig(BasicNewsRecipe):
|
||||
,'publisher' : publisher
|
||||
}
|
||||
|
||||
feeds = [(u'Dilbert', u'http://feeds.dilbert.com/DilbertDailyStrip' )]
|
||||
feeds = [(u'Dilbert', u'http://feed.dilbert.com/dilbert/daily_strip' )]
|
||||
|
||||
def get_article_url(self, article):
|
||||
return article.get('feedburner_origlink', None)
|
||||
|
@ -9,7 +9,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
|
||||
|
||||
import mechanize, string, urllib, time, re
|
||||
import string, time, re
|
||||
|
||||
class Economist(BasicNewsRecipe):
|
||||
|
||||
@ -18,19 +18,19 @@ class Economist(BasicNewsRecipe):
|
||||
|
||||
__author__ = "Kovid Goyal"
|
||||
INDEX = 'http://www.economist.com/printedition'
|
||||
description = ('Global news and current affairs from a European perspective.'
|
||||
' Needs a subscription from ')+INDEX
|
||||
description = 'Global news and current affairs from a European perspective.'
|
||||
|
||||
oldest_article = 7.0
|
||||
cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
|
||||
remove_tags = [dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
|
||||
dict(attrs={'class':['dblClkTrk', 'ec-article-info']})]
|
||||
keep_only_tags = [dict(id='ec-article-body')]
|
||||
needs_subscription = True
|
||||
needs_subscription = False
|
||||
no_stylesheets = True
|
||||
preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
|
||||
lambda x:'</html>')]
|
||||
|
||||
'''
|
||||
def get_browser(self):
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
br.open('http://www.economist.com')
|
||||
@ -50,6 +50,7 @@ class Economist(BasicNewsRecipe):
|
||||
}))
|
||||
br.open(req).read()
|
||||
return br
|
||||
'''
|
||||
|
||||
def parse_index(self):
|
||||
try:
|
||||
|
@ -7,12 +7,12 @@ from lxml import html
|
||||
|
||||
class Economist(BasicNewsRecipe):
|
||||
|
||||
title = 'The Economist (free)'
|
||||
title = 'The Economist (RSS)'
|
||||
language = 'en'
|
||||
|
||||
__author__ = "Kovid Goyal"
|
||||
description = ('Global news and current affairs from a European perspective.'
|
||||
' Much slower than the subscription based version.')
|
||||
' Much slower than the print edition based version.')
|
||||
|
||||
oldest_article = 7.0
|
||||
cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
|
||||
|
122
resources/recipes/el_correo.recipe
Normal file
@ -0,0 +1,122 @@
|
||||
#!/usr/bin/env python
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '08 January 2011, desUBIKado'
|
||||
__author__ = 'desUBIKado'
|
||||
__description__ = 'Daily newspaper from Biscay'
|
||||
__version__ = 'v0.08'
|
||||
__date__ = '08, January 2011'
|
||||
'''
|
||||
[url]http://www.elcorreo.com/[/url]
|
||||
'''
|
||||
|
||||
import time
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class heraldo(BasicNewsRecipe):
|
||||
__author__ = 'desUBIKado'
|
||||
description = 'Daily newspaper from Biscay'
|
||||
title = u'El Correo'
|
||||
publisher = 'Vocento'
|
||||
category = 'News, politics, culture, economy, general interest'
|
||||
oldest_article = 2
|
||||
delay = 1
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
language = 'es'
|
||||
timefmt = '[%a, %d %b, %Y]'
|
||||
encoding = 'iso-8859-1'
|
||||
remove_empty_feeds = True
|
||||
remove_javascript = False
|
||||
|
||||
feeds = [
|
||||
(u'Portada', u'http://www.elcorreo.com/vizcaya/portada.xml'),
|
||||
(u'Local', u'http://www.elcorreo.com/vizcaya/rss/feeds/vizcaya.xml'),
|
||||
(u'Internacional', u'http://www.elcorreo.com/vizcaya/rss/feeds/internacional.xml'),
|
||||
(u'Econom\xeda', u'http://www.elcorreo.com/vizcaya/rss/feeds/economia.xml'),
|
||||
(u'Pol\xedtica', u'http://www.elcorreo.com/vizcaya/rss/feeds/politica.xml'),
|
||||
(u'Opini\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/opinion.xml'),
|
||||
(u'Deportes', u'http://www.elcorreo.com/vizcaya/rss/feeds/deportes.xml'),
|
||||
(u'Sociedad', u'http://www.elcorreo.com/vizcaya/rss/feeds/sociedad.xml'),
|
||||
(u'Cultura', u'http://www.elcorreo.com/vizcaya/rss/feeds/cultura.xml'),
|
||||
(u'Televisi\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/television.xml'),
|
||||
(u'Gente', u'http://www.elcorreo.com/vizcaya/rss/feeds/gente.xml')
|
||||
]
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class':['grouphead','date','art_head','story-texto','text','colC_articulo','contenido_comentarios']}),
|
||||
dict(name='div' , attrs={'id':['articulo','story-texto','story-entradilla']})
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class':['art_barra','detalles-opinion','formdenunciar','modulo calculadoras','nubetags','pie']}),
|
||||
dict(name='div', attrs={'class':['mod_lomas','bloque_lomas','blm_header','link-app3','link-app4','botones_listado']}),
|
||||
dict(name='div', attrs={'class':['navegacion_galeria','modulocanalpromocion','separa','separacion','compartir','tags_relacionados']}),
|
||||
dict(name='div', attrs={'class':['moduloBuscadorDeportes','modulo-gente','moddestacadopeq','OpcArt','articulopiniones']}),
|
||||
dict(name='div', attrs={'class':['modulo-especial','publiEspecial']}),
|
||||
dict(name='div', attrs={'id':['articulopina']}),
|
||||
dict(name='br', attrs={'class':'clear'}),
|
||||
dict(name='form', attrs={'name':'frm_conversor2'})
|
||||
]
|
||||
|
||||
remove_tags_before = dict(name='div' , attrs={'class':'articulo '})
|
||||
remove_tags_after = dict(name='div' , attrs={'class':'comentarios'})
|
||||
|
||||
def get_cover_url(self):
|
||||
cover = None
|
||||
st = time.localtime()
|
||||
year = str(st.tm_year)
|
||||
month = "%.2d" % st.tm_mon
|
||||
day = "%.2d" % st.tm_mday
|
||||
#[url]http://img.kiosko.net/2011/01/02/es/elcorreo.750.jpg[/url]
|
||||
#[url]http://info.elcorreo.com/pdf/06012011-viz.pdf[/url]
|
||||
cover='http://info.elcorreo.com/pdf/'+ day + month + year +'-viz.pdf'
|
||||
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
try:
|
||||
br.open(cover)
|
||||
except:
|
||||
self.log("\nPortada no disponible")
|
||||
cover ='http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
|
||||
return cover
|
||||
|
||||
extra_css = '''
|
||||
h1, .headline {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
|
||||
h2, .subhead {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:18px;}
|
||||
h3, .overhead {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
|
||||
h4 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
|
||||
h5 {font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
|
||||
h6 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
|
||||
.date,.byline, .photo {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
|
||||
img{margin-bottom: 0.4em}
|
||||
'''
|
||||
|
||||
|
||||
|
||||
preprocess_regexps = [
|
||||
|
||||
# To present the image of the embedded video
|
||||
(re.compile(r'var RUTA_IMAGEN', re.DOTALL|re.IGNORECASE), lambda match: '</script><img src'),
|
||||
(re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
|
||||
(re.compile(r'var SITIO = "elcorreo";', re.DOTALL|re.IGNORECASE), lambda match: '<SCRIPT TYPE="text/JavaScript"'),
|
||||
|
||||
# To separate paragraphs with a blank line
|
||||
(re.compile(r'<div class="p"', re.DOTALL|re.IGNORECASE), lambda match: '<p></p><div class="p"'),
|
||||
|
||||
# To put a blank line between the subtitle and the date and time of the news
|
||||
(re.compile(r'<div class="date">', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="date">'),
|
||||
|
||||
# To put a blank line between the intro of the embedded videos and the previous text
|
||||
(re.compile(r'<div class="video"', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="video"'),
|
||||
|
||||
# To view photos from the first when these are presented as a gallery
|
||||
(re.compile(r'src="/img/shim.gif"', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(r'rel=', re.DOTALL|re.IGNORECASE), lambda match: 'src='),
|
||||
|
||||
# To remove the link of the title
|
||||
(re.compile(r'<h1 class="headline">\n<a href="', re.DOTALL|re.IGNORECASE), lambda match: '<h1 class="'),
|
||||
(re.compile(r'</a>\n</h1>', re.DOTALL|re.IGNORECASE), lambda match: '</h1>'),
|
||||
|
||||
]
|
||||
|
@ -9,13 +9,14 @@ __docformat__ = 'restructuredtext en'
|
||||
elpais.es
|
||||
'''
|
||||
|
||||
from time import strftime
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class ElPais(BasicNewsRecipe):
|
||||
__author__ = 'Kovid Goyal & Lorenzo Vigentini & Jordi Balcells'
|
||||
description = 'Main daily newspaper from Spain'
|
||||
|
||||
cover_url = 'http://www.elpais.com/im/tit_logo_global.gif'
|
||||
title = u'El Pais'
|
||||
publisher = u'Ediciones El Pa\xeds SL'
|
||||
category = 'News, politics, culture, economy, general interest'
|
||||
@ -62,6 +63,6 @@ class ElPais(BasicNewsRecipe):
|
||||
(u'Vi\xf1etas', u'http://www.elpais.com/rss/feed.html?feedId=17058')
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
url = url+'?print=1'
|
||||
return url
|
||||
def get_cover_url(self):
|
||||
return 'http://img5.kiosko.net/' + strftime("%Y/%m/%d") + '/es/elpais.750.jpg'
|
||||
|
||||
|
43
resources/recipes/el_publico.recipe
Normal file
@ -0,0 +1,43 @@
|
||||
#!/usr/bin/env python
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'Gerardo Diez'
|
||||
__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
|
||||
description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
publico.es
|
||||
'''
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
class Publico(BasicNewsRecipe):
|
||||
title =u'Publico.es'
|
||||
__author__ ='Gerardo Diez'
|
||||
publisher =u'Mediapubli Sociedad de Publicaciones y Ediciones S.L.'
|
||||
category ='news, politics, finances, world, spain, science, catalunya'
|
||||
oldest_article =1
|
||||
max_articles_per_feed =100
|
||||
simultaneous_downloads =10
|
||||
cover_url =u'http://imagenes.publico.es/css/img/logo_publico.gif'
|
||||
timefmt ='[%a, %d %b, %Y]'
|
||||
encoding ='utf8'
|
||||
language ='es'
|
||||
remove_javascript =True
|
||||
no_stylesheets =True
|
||||
keep_only_tags =dict(id='main')
|
||||
remove_tags =[
|
||||
dict(name='div', attrs={'class':['Noticias_642x50', 'contInfo ancho']}),
|
||||
dict(name='ul', attrs={'class':['navComentarios', 'comentarios']}),
|
||||
dict(name='div', attrs={'id':['commentsContext', 'toolbar', 'comentarios']}),
|
||||
dict(name='h5', attrs={'id':'comentarios'})
|
||||
]
|
||||
feeds =[(u'Internacional', u'http://www.publico.es/estaticos/rss/internacional'),
|
||||
(u'Espa\xf1a', u'http://www.publico.es/estaticos/rss/espana'),
|
||||
(u'Dinero', u'http://www.publico.es/estaticos/rss/dinero'),
|
||||
(u'Ciencias', u'http://www.publico.es/estaticos/rss/ciencias'),
|
||||
(u'Culturas', u'http://www.publico.es/estaticos/rss/culturas'),
|
||||
(u'Deportes', u'http://www.publico.es/estaticos/rss/deportes'),
|
||||
(u'Televisi\xf3n y Gente', u'http://www.publico.es/estaticos/rss/televisionygente'),
|
||||
(u'Catalu\xf1a', u'http://www.publico.es/estaticos/rss/catalunya'),
|
||||
(u'Viajes', u'http://www.publico.es/estaticos/rss/viajes')]
|
||||
|
||||
|
@ -17,7 +17,7 @@ class ElPais_RSS(BasicNewsRecipe):
|
||||
no_stylesheets = True
|
||||
encoding = 'cp1252'
|
||||
use_embedded_content = False
|
||||
language = 'es_ES'
|
||||
language = 'es'
|
||||
remove_empty_feeds = True
|
||||
publication_type = 'newspaper'
|
||||
masthead_url = 'http://www.elpais.com/im/tit_logo.gif'
|
||||
|
@ -1,7 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
exiledonline.com
|
||||
'''
|
||||
@ -21,17 +19,19 @@ class Exiled(BasicNewsRecipe):
|
||||
encoding = 'utf8'
|
||||
remove_javascript = True
|
||||
language = 'en'
|
||||
publication_type = 'newsblog'
|
||||
masthead_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
|
||||
extra_css = """
|
||||
body{font-family: Arial,Helvetica,sans-serif}
|
||||
#topslug{font-size: xx-large; font-weight: bold; color: red}
|
||||
"""
|
||||
|
||||
cover_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
|
||||
|
||||
html2lrf_options = [
|
||||
'--comment' , description
|
||||
, '--base-font-size', '10'
|
||||
, '--category' , category
|
||||
, '--publisher' , publisher
|
||||
]
|
||||
|
||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
|
||||
conversion_options = {
|
||||
'comment' : description
|
||||
, 'tags' : category
|
||||
, 'publisher' : publisher
|
||||
, 'language' : language
|
||||
}
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'main'})]
|
||||
|
||||
@ -47,12 +47,13 @@ class Exiled(BasicNewsRecipe):
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
mtag = '\n<meta http-equiv="Content-Language" content="en"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n'
|
||||
soup.head.insert(0,mtag)
|
||||
for alink in soup.findAll('a'):
|
||||
if alink.string is not None:
|
||||
tstr = alink.string
|
||||
alink.replaceWith(tstr)
|
||||
return soup
|
||||
|
||||
def get_article_url(self, article):
|
||||
raw = article.get('link', None)
|
||||
final = raw + 'all/1/'
|
||||
return final
|
||||
|
||||
|
@ -1,59 +1,79 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
www.expansion.com
|
||||
'''
|
||||
__author__ = 'Gerardo Diez'
|
||||
__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
|
||||
description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import Tag
|
||||
|
||||
class Expansion(BasicNewsRecipe):
|
||||
title = 'Diario Expansion'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'Lider de informacion de mercados, economica y politica'
|
||||
publisher = 'expansion.com'
|
||||
category = 'news, politics, Spain'
|
||||
oldest_article = 2
|
||||
'''
|
||||
expansion.es
|
||||
'''
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
class Publico(BasicNewsRecipe):
|
||||
title =u'Expansion.com'
|
||||
__author__ ='Gerardo Diez'
|
||||
publisher =u'Unidad Editorial Información Económica, S.L.'
|
||||
category ='finances, catalunya'
|
||||
oldest_article =1
|
||||
max_articles_per_feed =100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
delay = 1
|
||||
encoding = 'iso-8859-15'
|
||||
simultaneous_downloads =10
|
||||
cover_url =u'http://estaticos01.expansion.com/iconos/v2.x/v2.0/cabeceras/logo_expansion.png'
|
||||
timefmt ='[%A, %d %B, %Y]'
|
||||
encoding ='latin'
|
||||
language ='es'
|
||||
|
||||
direction = 'ltr'
|
||||
|
||||
html2lrf_options = [
|
||||
'--comment' , description
|
||||
, '--category' , category
|
||||
, '--publisher', publisher
|
||||
]
|
||||
|
||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
|
||||
|
||||
feeds = [
|
||||
(u'Ultimas noticias', u'http://rss.expansion.com/rss/descarga.htm?data2=178')
|
||||
,(u'Temas del dia' , u'http://rss.expansion.com/rss/descarga.htm?data2=178')
|
||||
]
|
||||
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'principal'})]
|
||||
|
||||
remove_javascript =True
|
||||
no_stylesheets =True
|
||||
keep_only_tags =dict(name='div', attrs={'class':['noticia primer_elemento']})
|
||||
remove_tags =[
|
||||
dict(name=['object','link','script'])
|
||||
,dict(name='div', attrs={'class':['utilidades','tit_relacionadas']})
|
||||
dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto']}),
|
||||
dict(name='ul', attrs={'class':['bolos_desarrollo_noticia']}),
|
||||
dict(name='span', attrs={'class':['comentarios']}),
|
||||
dict(name='p', attrs={'class':['cintillo_comentarios', 'cintillo_comentarios formulario']}),
|
||||
dict(name='div', attrs={'id':['comentarios_lectores_listado']})
|
||||
]
|
||||
feeds =[
|
||||
(u'Portada', u'http://estaticos.expansion.com/rss/portada.xml'),
|
||||
(u'Portada: Bolsas', u'http://estaticos.expansion.com/rss/mercados.xml'),
|
||||
(u'Divisas', u'http://estaticos.expansion.com/rss/mercadosdivisas.xml'),
|
||||
(u'Euribor', u'http://estaticos.expansion.com/rss/mercadoseuribor.xml'),
|
||||
(u'Materias Primas', u'http://estaticos.expansion.com/rss/mercadosmateriasprimas.xml'),
|
||||
(u'Renta Fija', u'http://estaticos.expansion.com/rss/mercadosrentafija.xml'),
|
||||
|
||||
(u'Portada: Mi Dinero', u'http://estaticos.expansion.com/rss/midinero.xml'),
|
||||
(u'Hipotecas', u'http://estaticos.expansion.com/rss/midinerohipotecas.xml'),
|
||||
(u'Créditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
|
||||
(u'Pensiones', u'http://estaticos.expansion.com/rss/midineropensiones.xml'),
|
||||
(u'Fondos de Inversión', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
|
||||
(u'Motor', u'http://estaticos.expansion.com/rss/midineromotor.xml'),
|
||||
|
||||
(u'Portada: Empresas', u'http://estaticos.expansion.com/rss/empresas.xml'),
|
||||
(u'Banca', u'http://estaticos.expansion.com/rss/empresasbanca.xml'),
|
||||
(u'TMT', u'http://estaticos.expansion.com/rss/empresastmt.xml'),
|
||||
(u'Energía', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
|
||||
(u'Inmobiliario y Construcción', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
|
||||
(u'Transporte y Turismo', u'http://estaticos.expansion.com/rss/empresastransporte.xml'),
|
||||
(u'Automoción e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
|
||||
(u'Distribución', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
|
||||
(u'Deporte y Negocio', u' http://estaticos.expansion.com/rss/empresasdeporte.xml'),
|
||||
(u'Mi Negocio', u'http://estaticos.expansion.com/rss/empresasminegocio.xml'),
|
||||
(u'Interiores', u'http://estaticos.expansion.com/rss/empresasinteriores.xml'),
|
||||
(u'Digitech', u'http://estaticos.expansion.com/rss/empresasdigitech.xml'),
|
||||
|
||||
(u'Portada: Economía y Política', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
|
||||
(u'Política', u'http://estaticos.expansion.com/rss/economia.xml'),
|
||||
(u'Portada: Sociedad', u'http://estaticos.expansion.com/rss/entorno.xml'),
|
||||
|
||||
(u'Portada: Opinión', u'http://estaticos.expansion.com/rss/opinion.xml'),
|
||||
(u'Llaves y editoriales', u'http://estaticos.expansion.com/rss/opinioneditorialyllaves.xml'),
|
||||
(u'Tribunas', u'http://estaticos.expansion.com/rss/opiniontribunas.xml'),
|
||||
|
||||
(u'Portada: Jurídico', u'http://estaticos.expansion.com/rss/juridico.xml'),
|
||||
(u'Entrevistas', u'http://estaticos.expansion.com/rss/juridicoentrevistas.xml'),
|
||||
(u'Opinión', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
|
||||
(u'Sentencias', u'http://estaticos.expansion.com/rss/juridicosentencias.xml'),
|
||||
|
||||
(u'Mujer', u'http://estaticos.expansion.com/rss/mujer-empresa.xml'),
|
||||
(u'Cataluña', u'http://estaticos.expansion.com/rss/catalunya.xml'),
|
||||
(u'Función pública', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
|
||||
]
|
||||
|
||||
remove_tags_after = [dict(name='div', attrs={'class':'tit_relacionadas'})]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
soup.html['dir' ] = self.direction
|
||||
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
|
||||
soup.head.insert(0,mcharset)
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
return soup
|
||||
|
||||
|
@@ -1,5 +1,5 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
ft.com
'''
@@ -52,12 +52,9 @@ class FinancialTimes(BasicNewsRecipe):
    .copyright{font-size: x-small}
    """

    def parse_index(self):
    def get_artlinks(self, elem):
        articles = []
        soup = self.index_to_soup(self.INDEX)
        wide = soup.find('div',attrs={'class':'wide'})
        if wide:
            for item in wide.findAll('a',href=True):
        for item in elem.findAll('a',href=True):
            url = self.PREFIX + item['href']
            title = self.tag_to_string(item)
            date = strftime(self.timefmt)
@@ -67,7 +64,26 @@ class FinancialTimes(BasicNewsRecipe):
                          ,'url'        :url
                          ,'description':''
                         })
        return [('FT UK edition',articles)]
        return articles

    def parse_index(self):
        feeds = []
        soup = self.index_to_soup(self.INDEX)
        wide = soup.find('div',attrs={'class':'wide'})
        if not wide:
            return feeds
        strest = wide.findAll('h3', attrs={'class':'section'})
        if not strest:
            return feeds
        st = wide.find('h4',attrs={'class':'section-no-arrow'})
        if st:
            strest.insert(0,st)
        for item in strest:
            ftitle = self.tag_to_string(item)
            self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle))
            feedarts = self.get_artlinks(item.parent.ul)
            feeds.append((ftitle,feedarts))
        return feeds

    def preprocess_html(self, soup):
        return self.adeify_images(soup)
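For context, parse_index() in a calibre recipe returns a list of (section title, article list) tuples, which is the shape the rewritten method above builds from the h3 section headers. A sketch of that return value, with purely made-up entries (the headline and URL below are illustrative, not taken from ft.com):

# Illustrative only: shape of the value parse_index() is expected to return.
feeds = [
    (u'UK', [
        {'title'      : u'Example headline',
         'url'        : 'http://www.ft.com/example-article',
         'date'       : '',
         'description': ''},
    ]),
]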
@ -1,4 +1,5 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import Tag
|
||||
import re
|
||||
|
||||
class NatureNews(BasicNewsRecipe):
|
||||
@ -10,17 +11,76 @@ class NatureNews(BasicNewsRecipe):
|
||||
max_articles_per_feed = 50
|
||||
|
||||
no_stylesheets = True
|
||||
remove_tags_before = dict(name='h1', attrs={'class':'heading entry-title'})
|
||||
remove_tags_after = dict(name='h2', attrs={'id':'comments'})
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'content'})]
|
||||
# remove_tags_before = dict(name='h1', attrs={'class':'heading entry-title'})
|
||||
# remove_tags_after = dict(name='h2', attrs={'id':'comments'})
|
||||
remove_tags = [
|
||||
dict(name='h2', attrs={'id':'comments'}),
|
||||
dict(attrs={'alt':'Advertisement'}),
|
||||
dict(name='div', attrs={'class':'ad'}),
|
||||
dict(attrs={'class':'Z3988'}),
|
||||
dict(attrs={'class':['formatpublished','type-of-article','cleardiv','disclaimer','buttons','comments xoxo']}),
|
||||
dict(name='a', attrs={'href':'#comments'}),
|
||||
dict(name='h2',attrs={'class':'subheading plusicon icon-add-comment'})
|
||||
]
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<p>ADVERTISEMENT</p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
]
|
||||
|
||||
extra_css = '''
|
||||
.author { text-align: right; font-size: small; line-height:1em; margin-top:0px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
||||
.imagedescription { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
||||
.imagecredit { font-size: x-small; font-style: normal; font-weight: bold}
|
||||
'''
|
||||
|
||||
feeds = [('Nature News', 'http://feeds.nature.com/news/rss/most_recent')]
|
||||
|
||||
def preprocess_html(self,soup):
|
||||
# The author name is slightly buried - dig it up
|
||||
author = soup.find('p', {'class':'byline'})
|
||||
if author:
|
||||
# Find out the author's name
|
||||
authornamediv = author.find('span',{'class':'author fn'})
|
||||
authornamelink = authornamediv.find('a')
|
||||
if authornamelink:
|
||||
authorname = authornamelink.contents[0]
|
||||
else:
|
||||
authorname = authornamediv.contents[0]
|
||||
# Stick the author's name in the byline tag
|
||||
tag = Tag(soup,'div')
|
||||
tag['class'] = 'author'
|
||||
tag.insert(0,authorname.strip())
|
||||
author.replaceWith(tag)
|
||||
|
||||
# Change the intro from a p to a div
|
||||
intro = soup.find('p',{'class':'intro'})
|
||||
if intro:
|
||||
tag = Tag(soup,'div')
|
||||
tag['class'] = 'intro'
|
||||
tag.insert(0,intro.contents[0])
|
||||
intro.replaceWith(tag)
|
||||
|
||||
# Change span class=imagedescription to div
|
||||
descr = soup.find('span',{'class':'imagedescription'})
|
||||
if descr:
|
||||
tag = Tag(soup,'div')
|
||||
tag['class'] = 'imagedescription'
|
||||
tag.insert(0,descr.renderContents())
|
||||
descr.replaceWith(tag)
|
||||
|
||||
# The references are in a list, let's make them simpler
|
||||
reflistcont = soup.find('ul',{'id':'article-refrences'})
|
||||
if reflistcont:
|
||||
reflist = reflistcont.li.renderContents()
|
||||
tag = Tag(soup,'div')
|
||||
tag['class'] = 'article-references'
|
||||
tag.insert(0,reflist)
|
||||
reflistcont.replaceWith(tag)
|
||||
|
||||
# Within the id=content div, we need to remove all the stuff after the end of the class=entry-content
|
||||
entrycontent = soup.find('div',{'class':'entry-content'})
|
||||
for nextSibling in entrycontent.findNextSiblings():
|
||||
nextSibling.extract()
|
||||
|
||||
return soup
|
||||
|
@@ -8,12 +8,13 @@ __docformat__ = 'restructuredtext en'
globeandmail.com
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1287083651(BasicNewsRecipe):
    title = u'Globe & Mail'
    __license__ = 'GPL v3'
    __author__ = 'Szing'
    __author__ = 'Kovid Goyal'
    oldest_article = 2
    no_stylesheets = True
    max_articles_per_feed = 100
@@ -38,24 +39,19 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
      (u'Sports', u'http://www.theglobeandmail.com/auto/?service=rss')
    ]

    keep_only_tags = [
       dict(name='h1'),
       dict(name='h2', attrs={'id':'articletitle'}),
       dict(name='p', attrs={'class':['leadText', 'meta', 'leadImage', 'redtext byline', 'bodyText']}),
       dict(name='div', attrs={'class':['news','articlemeta','articlecopy']}),
       dict(name='id', attrs={'class':'article'}),
       dict(name='table', attrs={'class':'todays-market'}),
       dict(name='header', attrs={'id':'leadheader'})
    preprocess_regexps = [
       (re.compile(r'<head.*?</head>', re.DOTALL), lambda m: '<head></head>'),
       (re.compile(r'<script.*?</script>', re.DOTALL), lambda m: ''),
    ]

    remove_tags_before = dict(name='h1')
    remove_tags = [
       dict(name='div', attrs={'id':['tabInside', 'ShareArticles', 'topStories']})
    ]

    #this has to be here or the text in the article appears twice.
    remove_tags_after = [dict(id='article')]
       dict(name='div', attrs={'id':['ShareArticles', 'topStories']}),
       dict(href=lambda x: x and 'tracking=' in x),
       {'class':['articleTools', 'pagination', 'Ads', 'topad',
                 'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]

    #Use the mobile version rather than the web version
    def print_version(self, url):
        return url + '&service=mobile'
        return url.rpartition('?')[0] + '?service=mobile'
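The new print_version drops whatever query string the feed URL carries before appending the mobile flag. A small sketch of the effect, using a hypothetical article URL purely for illustration:

# Illustrative only: the URL below is hypothetical.
url = 'http://www.theglobeandmail.com/news/world/article1234567/?cmpid=rss1'
print url.rpartition('?')[0] + '?service=mobile'
# -> http://www.theglobeandmail.com/news/world/article1234567/?service=mobile

Note that when a URL carries no '?', rpartition('?')[0] is empty, so this relies on the feed URLs always including a query string.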
 64  resources/recipes/gulfnews.recipe  Normal file
@@ -0,0 +1,64 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
gulfnews.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class GulfNews(BasicNewsRecipe):
    title                 = 'Gulf News'
    __author__            = 'Darko Miletic'
    description           = 'News from the United Arab Emirates, the Persian Gulf and the rest of the world'
    publisher             = 'Al Nisr Publishing LLC'
    category              = 'news, politics, UAE, world'
    oldest_article        = 2
    max_articles_per_feed = 200
    no_stylesheets        = True
    encoding              = 'utf8'
    use_embedded_content  = False
    language              = 'en'
    remove_empty_feeds    = True
    publication_type      = 'newsportal'
    masthead_url          = 'http://gulfnews.com/media/img/gulf_news_logo.jpg'
    extra_css             = """
                               body{font-family: Arial,Helvetica,sans-serif }
                               img{margin-bottom: 0.4em; display:block}
                               h1{font-family: Georgia, 'Times New Roman', Times, serif}
                               ol,ul{list-style: none}
                               .synopsis{font-size: small}
                               .details{font-size: x-small}
                               .image{font-size: xx-small}
                            """

    conversion_options = {
                          'comment'   : description
                        , 'tags'      : category
                        , 'publisher' : publisher
                        , 'language'  : language
                        }

    remove_tags = [
                     dict(name=['meta','link','object','embed'])
                    ,dict(attrs={'class':['quickLinks','ratings']})
                    ,dict(attrs={'id':'imageSelector'})
                  ]
    remove_attributes = ['lang']
    keep_only_tags = [
                        dict(name='h1')
                       ,dict(attrs={'class':['synopsis','details','image','article']})
                     ]

    feeds = [
              (u'UAE News'      , u'http://gulfnews.com/cmlink/1.446094')
             ,(u'Business'      , u'http://gulfnews.com/cmlink/1.446098')
             ,(u'Entertainment' , u'http://gulfnews.com/cmlink/1.446095')
             ,(u'Sport'         , u'http://gulfnews.com/cmlink/1.446096')
             ,(u'Life'          , u'http://gulfnews.com/cmlink/1.446097')
            ]

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup
@@ -3,13 +3,14 @@ __license__ = 'GPL v3'
__copyright__ = '04 December 2010, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Aragon'
__version__ = 'v0.03'
__date__ = '11, December 2010'
__version__ = 'v0.04'
__date__ = '6, January 2011'
'''
[url]http://www.heraldo.es/[/url]
'''

import time
import re
from calibre.web.feeds.news import BasicNewsRecipe

class heraldo(BasicNewsRecipe):
@@ -20,12 +21,13 @@ class heraldo(BasicNewsRecipe):
    category = 'News, politics, culture, economy, general interest'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 1
    oldest_article = 2
    delay = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    remove_javascript = True
    no_stylesheets = True
    recursion = 10

    feeds = [
             (u'Portadas', u'http://www.heraldo.es/index.php/mod.portadas/mem.rss')
@@ -37,7 +39,8 @@ class heraldo(BasicNewsRecipe):

    remove_tags = [dict(name='a', attrs={'class':['com flo-r','enl-if','enl-df']}),
                   dict(name='div', attrs={'class':['brb-b-s con marg-btt','cnt-rel con']}),
                   dict(name='form', attrs={'class':'form'})]
                   dict(name='form', attrs={'class':'form'}),
                   dict(name='ul', attrs={'id':['cont-tags','pag-1']})]

    remove_tags_before = dict(name='div' , attrs={'id':'dts'})
    remove_tags_after  = dict(name='div' , attrs={'id':'com'})
@@ -59,7 +62,16 @@ class heraldo(BasicNewsRecipe):
        return cover

    extra_css = '''
                   h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:xx-large;}
                   .con strong{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
                   .con h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
                   .con span{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:12px;}
                   .ent {font-family:Arial,Helvetica,sans-serif; font-weight:normal; font-style:italic; font-size:18px;}
                   img{margin-bottom: 0.4em}
                '''

    preprocess_regexps = [
                           # To separate the comments with a blank line
                           (re.compile(r'<div id="com"', re.DOTALL|re.IGNORECASE), lambda match: '<br><div id="com"')
                         ]
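preprocess_regexps entries like the one above are applied to the raw downloaded HTML before parsing, each entry being a (compiled pattern, replacement callable) pair. A small sketch of the same substitution applied by hand, on a made-up fragment:

import re

# Illustrative only: a tiny fragment standing in for a downloaded heraldo.es page.
raw = '<p>Noticia</p><div id="com">Comentarios</div>'
pattern, repl = (re.compile(r'<div id="com"', re.DOTALL|re.IGNORECASE),
                 lambda match: '<br><div id="com"')
print pattern.sub(repl, raw)
# -> <p>Noticia</p><br><div id="com">Comentarios</div>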
@@ -5,6 +5,7 @@ class AdvancedUserRecipe1293122276(BasicNewsRecipe):
    __author__ = 'Jack Mason'
    author = 'IBM Global Business Services'
    publisher = 'IBM'
    language = 'en'
    category = 'news, technology, IT, internet of things, analytics'
    oldest_article = 7
    max_articles_per_feed = 30
182
resources/recipes/ihned.recipe
Normal file
182
resources/recipes/ihned.recipe
Normal file
@ -0,0 +1,182 @@
|
||||
import re, time
|
||||
from calibre import strftime
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class IHNed(BasicNewsRecipe):
|
||||
|
||||
|
||||
stahnout_vsechny = True
|
||||
#True = stahuje vsechny z homepage
|
||||
#False = stahuje pouze dnesni clanky (ze dne, kdy je skript spusten)
|
||||
|
||||
title = 'iHNed'
|
||||
__author__ = 'Karel Bílek'
|
||||
language = 'cs'
|
||||
description = 'Zprávy z iHNed.cz'
|
||||
timefmt = ' [%a, %d %b, %Y]'
|
||||
needs_subscription = False
|
||||
remove_tags = [dict(attrs={'class':['borderbottom', 'web', 'foot', 'reklama', 'd-elm d-rellinks', 'd-elm']}),
|
||||
dict(style=['text-align: center;']),
|
||||
dict(id=['r-bfull']),
|
||||
dict(name=['script', 'noscript', 'style'])]
|
||||
encoding = 'windows-1250'
|
||||
no_stylesheets = True
|
||||
remove_tags_before = dict(attrs={'class':'d-nadtit'})
|
||||
remove_tags_after = dict(attrs={'class':'like'})
|
||||
|
||||
conversion_options = {
|
||||
'linearize_tables' : True,
|
||||
}
|
||||
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
|
||||
def makeurl(wat):
|
||||
return "http://ihned.cz"+wat;
|
||||
|
||||
for h1 in soup.findAll('h1'):
|
||||
a = h1.find('a')
|
||||
if a:
|
||||
string = a.string
|
||||
if string:
|
||||
soup.a.replaceWith(string)
|
||||
for a in soup.findAll('a', href=True) :
|
||||
cil = str(a['href'])
|
||||
if cil.startswith("/") or cil.startswith("index"):
|
||||
a['href'] = makeurl(cil)
|
||||
return soup
|
||||
|
||||
|
||||
def parse_index(self):
|
||||
|
||||
def makeurl(wat):
|
||||
if wat.startswith("/") or wat.startswith("index"):
|
||||
return "http://ihned.cz"+wat;
|
||||
else:
|
||||
return wat
|
||||
|
||||
|
||||
articles = {} #vysledek, asi
|
||||
key = None #soucasna sekce
|
||||
ans = [] #vsechny sekce
|
||||
|
||||
articles["Hlavní"] = []
|
||||
ans.append("Hlavní")
|
||||
|
||||
was = {}
|
||||
|
||||
def parse_subpage(url, name):
|
||||
articles[name] = []
|
||||
ans.append(name)
|
||||
|
||||
|
||||
soup = self.index_to_soup(url)
|
||||
otvirak = soup.find(True, attrs={'class':['otv']})
|
||||
if otvirak:
|
||||
|
||||
#the code is copypasted here because I don't know python. simple as that.
|
||||
a = otvirak.find('a', href=True)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
txt = otvirak.find(True, attrs={'class':['txt']})
|
||||
description = ''
|
||||
if txt:
|
||||
match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
|
||||
if match:
|
||||
description = match.group(1)
|
||||
|
||||
pubdate = strftime('%d. %m.')
|
||||
if not title in was:
|
||||
articles[name].append(
|
||||
dict(title=title, url=makeurl(a['href']), date=pubdate,
|
||||
description=description,
|
||||
content=''))
|
||||
|
||||
otv234 = soup.find(True, attrs={'class':['otv234', 'col2a']})
|
||||
if otv234:
|
||||
for ow in otv234.findAll(True, attrs={'class':['ow']}):
|
||||
a = ow.find('a', href=True)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
description=''
|
||||
prx = ow.find(True, attrs={'class':['prx']});
|
||||
if prx:
|
||||
description = str(prx.string)
|
||||
nfo = ow.find(True, attrs={'class':['nfo']});
|
||||
pubdate = ''
|
||||
if nfo:
|
||||
dtime = time.localtime();
|
||||
day = dtime[2]
|
||||
month = dtime[1]
|
||||
|
||||
pubdate = strftime('%d. %m.')
|
||||
|
||||
match = re.search(r'([0-9]*)\.([0-9]*)\.', str(nfo))
|
||||
|
||||
if self.stahnout_vsechny or (int(day) == int(match.group(1)) and int(month) == int(match.group(2))):
|
||||
if not title in was:
|
||||
articles[name].append(
|
||||
dict(title=title, url=makeurl(a['href']), date=pubdate,
|
||||
description=description,
|
||||
content=''))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
soup = self.index_to_soup('http://ihned.cz/')
|
||||
otvirak = soup.find(True, attrs={'class':['otv']})
|
||||
if otvirak:
|
||||
a = otvirak.find('a', href=True)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
txt = otvirak.find(True, attrs={'class':['txt']})
|
||||
description = ''
|
||||
if txt:
|
||||
match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
|
||||
if match:
|
||||
description = match.group(1)
|
||||
|
||||
pubdate = strftime('%d. %m.')
|
||||
feed = "Hlavní"
|
||||
articles[feed].append(
|
||||
dict(title=title, url=(a['href']), date=pubdate,
|
||||
description=description,
|
||||
content=''))
|
||||
was[title]=1
|
||||
|
||||
otvirak2345 = soup.find(True, attrs={'class':['otv2345']})
|
||||
if otvirak2345:
|
||||
for otv2 in otvirak2345.findAll(True, attrs={'class':['otv2-5']}):
|
||||
a = otv2.find('a', attrs={'class':['tit2']}, href=True)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
description=''
|
||||
span = otv2.find('span');
|
||||
if span:
|
||||
match = re.match(r'<span>\s*([^<]*)\s*<a', str(span), re.L)
|
||||
if match:
|
||||
description = match.group(1)
|
||||
feed = "Hlavní"
|
||||
pubdate = strftime('%d. %m.')
|
||||
articles[feed].append(
|
||||
dict(title=title, url=(a['href']), date=pubdate,
|
||||
description=description,
|
||||
content=''))
|
||||
was[title]=1
|
||||
|
||||
|
||||
parse_subpage("http://komentare.ihned.cz/", "Komentáře")
|
||||
parse_subpage("http://domaci.ihned.cz", "Domácí")
|
||||
parse_subpage("http://ekonomika.ihned.cz", "Ekonomika")
|
||||
parse_subpage("http://zahranicni.ihned.cz/", "Zahraničí");
|
||||
parse_subpage("http://finweb.ihned.cz/", "Finance");
|
||||
parse_subpage("http://digiweb.ihned.cz/", "DigiWeb");
|
||||
parse_subpage("http://kultura.ihned.cz/", "Kultura")
|
||||
parse_subpage("http://sport.ihned.cz/", "Sport");
|
||||
|
||||
#seradi kategorie
|
||||
ans = self.sort_index_by(ans, {'Hlavni':1, 'Domácí':2, 'Ekonomika':5, 'Zahraničí':3, 'Finance':6, 'DigiWeb':7, 'Kultura':8, 'Sport':9, 'Komentáře':4})
|
||||
|
||||
#vrati, ale pouze, kdyz je v kategoriich...
|
||||
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
|
||||
return ans
|
||||
|
@@ -6,6 +6,7 @@ class KANewsRecipe(BasicNewsRecipe):
    description = u'Nachrichten aus Karlsruhe, Deutschland und der Welt.'
    __author__ = 'tfeld'
    lang='de'
    language = 'de'
    no_stylesheets = True

    oldest_article = 7
 17  resources/recipes/kath_net.recipe  Normal file
@@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1295262156(BasicNewsRecipe):
    title = u'kath.net'
    __author__ = 'Bobus'
    oldest_article = 7
    language = 'en'
    max_articles_per_feed = 100

    feeds = [(u'kath.net', u'http://www.kath.net/2005/xml/index.xml')]

    def print_version(self, url):
        return url+"&print=yes"

    extra_css = 'td.textb {font-size: medium;}'
@@ -3,12 +3,17 @@ from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1274742400(BasicNewsRecipe):

    title = u'Las Vegas Review Journal'
    __author__ = 'Joel'
    __author__ = 'Kovid Goyal'
    language = 'en'

    oldest_article = 7

    max_articles_per_feed = 100
    keep_only_tags = [dict(id='content-main')]
    remove_tags = [dict(id=['right-col-content', 'trending-topics']),
                   {'class':['ppy-outer']}
                  ]
    no_stylesheets = True

    feeds = [
             (u'News', u'http://www.lvrj.com/news.rss'),
@ -20,8 +20,8 @@ class LaVanguardia(BasicNewsRecipe):
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
delay = 1
|
||||
encoding = 'cp1252'
|
||||
delay = 5
|
||||
# encoding = 'cp1252'
|
||||
language = 'es'
|
||||
|
||||
direction = 'ltr'
|
||||
@ -35,7 +35,7 @@ class LaVanguardia(BasicNewsRecipe):
|
||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
|
||||
|
||||
feeds = [
|
||||
(u'Ciudadanos' , u'http://feeds.feedburner.com/lavanguardia/ciudadanos' )
|
||||
(u'Portada' , u'http://feeds.feedburner.com/lavanguardia/home' )
|
||||
,(u'Cultura' , u'http://feeds.feedburner.com/lavanguardia/cultura' )
|
||||
,(u'Deportes' , u'http://feeds.feedburner.com/lavanguardia/deportes' )
|
||||
,(u'Economia' , u'http://feeds.feedburner.com/lavanguardia/economia' )
|
||||
@ -45,17 +45,17 @@ class LaVanguardia(BasicNewsRecipe):
|
||||
,(u'Internet y tecnologia', u'http://feeds.feedburner.com/lavanguardia/internet' )
|
||||
,(u'Motor' , u'http://feeds.feedburner.com/lavanguardia/motor' )
|
||||
,(u'Politica' , u'http://feeds.feedburner.com/lavanguardia/politica' )
|
||||
,(u'Sucessos' , u'http://feeds.feedburner.com/lavanguardia/sucesos' )
|
||||
,(u'Sucesos' , u'http://feeds.feedburner.com/lavanguardia/sucesos' )
|
||||
]
|
||||
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class':'element1_3'})
|
||||
dict(name='div', attrs={'class':'detalle noticia'})
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name=['object','link','script'])
|
||||
,dict(name='div', attrs={'class':['colC','peu']})
|
||||
,dict(name='div', attrs={'class':['colC','peu','jstoolbar']})
|
||||
]
|
||||
|
||||
remove_tags_after = [dict(name='div', attrs={'class':'text'})]
|
||||
@ -67,4 +67,3 @@ class LaVanguardia(BasicNewsRecipe):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
return soup
|
||||
|
||||
|
 32  resources/recipes/mail_and_guardian.recipe  Normal file
@@ -0,0 +1,32 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1295081935(BasicNewsRecipe):
    title                 = u'Mail & Guardian ZA News'
    __author__            = '77ja65'
    language              = 'en'
    oldest_article        = 7
    max_articles_per_feed = 30
    no_stylesheets        = True
    masthead_url          = 'http://c1608832.cdn.cloudfiles.rackspacecloud.com/mg_logo.gif'
    remove_tags_after     = [dict(id='content')]

    feeds = [
             (u'National News', u'http://www.mg.co.za/rss/national'),
             (u'Top Stories', u'http://www.mg.co.za/rss'),
             (u'Africa News', u'http://www.mg.co.za/rss/africa'),
             (u'Sport', u'http://www.mg.co.za/rss/sport'),
             (u'Business', u'http://www.mg.co.za/rss/business'),
             (u'And In Other News', u'http://www.mg.co.za/rss/and-in-other-news'),
             (u'World News', u'http://www.mg.co.za/rss/world')
            ]

    def print_version(self, url):
        return url.replace('http://www.mg.co.za/article/',
                           'http://www.mg.co.za/printformat/single/')

    extra_css = '''
                   h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold; font-size:large;}
                   h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal; font-size:small;}
                '''
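The print_version above rewrites article links onto the print path. A quick sketch with a hypothetical article URL, only to show the substitution:

# Illustrative only: hypothetical Mail & Guardian article URL.
url = 'http://www.mg.co.za/article/2011-01-20-example-story'
print url.replace('http://www.mg.co.za/article/',
                  'http://www.mg.co.za/printformat/single/')
# -> http://www.mg.co.za/printformat/single/2011-01-20-example-story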
@ -1,10 +1,9 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
msnbc.msn.com
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class MsNBC(BasicNewsRecipe):
|
||||
@ -19,7 +18,16 @@ class MsNBC(BasicNewsRecipe):
|
||||
publisher = 'msnbc.com'
|
||||
category = 'news, USA, world'
|
||||
language = 'en'
|
||||
extra_css = ' body{ font-family: sans-serif } .head{font-family: serif; font-size: xx-large; font-weight: bold; color: #CC0000} .abstract{font-weight: bold} .source{font-size: small} .updateTime{font-size: small} '
|
||||
extra_css = """
|
||||
body{ font-family: Georgia,Times,serif }
|
||||
.hide{display: none}
|
||||
.caption{font-family: Arial,sans-serif; font-size: x-small}
|
||||
.entry-summary{font-family: Arial,sans-serif}
|
||||
.copyright{font-size: 0.95em; font-style: italic}
|
||||
.source-org{font-size: small; font-family: Arial,sans-serif}
|
||||
img{display: block; margin-bottom: 0.5em}
|
||||
span.byline{display: none}
|
||||
"""
|
||||
|
||||
conversion_options = {
|
||||
'comments' : description
|
||||
@ -28,14 +36,20 @@ class MsNBC(BasicNewsRecipe):
|
||||
,'publisher': publisher
|
||||
}
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'</style></head>', re.DOTALL|re.IGNORECASE),lambda match: '</style>')
|
||||
,(re.compile(r'<div class="head">', re.DOTALL|re.IGNORECASE),lambda match: '</head><body><div class="head">'),
|
||||
remove_tags_before = dict(name='h1', attrs={'id':'headline'})
|
||||
remove_tags_after = dict(name='span', attrs={'class':['copyright','Linear copyright']})
|
||||
keep_only_tags=[
|
||||
dict(attrs={'id':['headline','deck','byline','source','intelliTXT']})
|
||||
,dict(attrs={'class':['gl_headline','articleText','drawer-content Linear','v-center3','byline','textBodyBlack']})
|
||||
]
|
||||
remove_attributes=['property','lang','rel','xmlns:fb','xmlns:v','xmlns:dc','xmlns:dcmitype','xmlns:og','xmlns:media','xmlns:vcard','typeof','itemscope','itemtype','itemprop','about','type','size','width','height','onreadystatechange','data','border','hspace','vspace']
|
||||
|
||||
remove_tags = [
|
||||
dict(name=['iframe','object','link','embed','meta','table'])
|
||||
,dict(name='span', attrs={'class':['copyright','Linear copyright']})
|
||||
,dict(name='div', attrs={'class':'social'})
|
||||
]
|
||||
|
||||
remove_tags_before = dict(name='div', attrs={'class':'head'})
|
||||
remove_tags_after = dict(name='div', attrs={'class':'copyright'})
|
||||
remove_tags = [dict(name=['iframe','object','link','script','form'])]
|
||||
|
||||
feeds = [
|
||||
(u'US News' , u'http://rss.msnbc.msn.com/id/3032524/device/rss/rss.xml' )
|
||||
@ -48,11 +62,26 @@ class MsNBC(BasicNewsRecipe):
|
||||
,(u'Tech & Science', u'http://rss.msnbc.msn.com/id/3032117/device/rss/rss.xml' )
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
return url + 'print/1/displaymode/1098/'
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.head.findAll('div'):
|
||||
for item in soup.body.findAll('html'):
|
||||
item.name='div'
|
||||
for item in soup.body.findAll('div'):
|
||||
if item.has_key('id') and item['id'].startswith('vine-'):
|
||||
item.extract()
|
||||
if item.has_key('class') and ( item['class'].startswith('ad') or item['class'].startswith('vine')):
|
||||
item.extract()
|
||||
for item in soup.body.findAll('img'):
|
||||
if not item.has_key('alt'):
|
||||
item['alt'] = 'image'
|
||||
for item in soup.body.findAll('ol'):
|
||||
if item.has_key('class') and item['class'].startswith('grid'):
|
||||
item.extract()
|
||||
for item in soup.body.findAll('span'):
|
||||
if ( item.has_key('id') and item['id'].startswith('byLine') and item.string is None) or ( item.has_key('class') and item['class'].startswith('inline') ):
|
||||
item.extract()
|
||||
for alink in soup.findAll('a'):
|
||||
if alink.string is not None:
|
||||
tstr = alink.string
|
||||
alink.replaceWith(tstr)
|
||||
return soup
|
||||
|
||||
|
@@ -10,6 +10,7 @@ import re
class NationalGeographicNews(BasicNewsRecipe):
    title = u'National Geographic News'
    oldest_article = 7
    language = 'en'
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
@@ -27,6 +27,9 @@ class NikkeiNet_sub_economy(BasicNewsRecipe):
                   {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
                   {'class':"cmn-article_keyword cmn-clearfix"},
                   {'class':"cmn-print_headline cmn-clearfix"},
                   {'class':"cmn-article_list"},
                   dict(id="ABOUT-NIKKEI"),
                   {'class':"cmn-sub_market"},
                  ]
    remove_tags_after = {'class':"cmn-pr_list"}

@ -1,5 +1,5 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
nrc.nl
|
||||
'''
|
||||
@ -15,13 +15,18 @@ class Pagina12(BasicNewsRecipe):
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 200
|
||||
no_stylesheets = True
|
||||
encoding = 'cp1252'
|
||||
encoding = 'utf8'
|
||||
use_embedded_content = False
|
||||
language = 'nl'
|
||||
country = 'NL'
|
||||
remove_empty_feeds = True
|
||||
masthead_url = 'http://www.nrc.nl/nrc.nl/images/logo_nrc.png'
|
||||
extra_css = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} h1,h2,h3{text-align:left} '
|
||||
extra_css = """
|
||||
body{font-family: Georgia,serif }
|
||||
img{margin-bottom: 0.4em; display: block}
|
||||
.bijschrift,.sectie{font-size: x-small}
|
||||
.sectie{color: gray}
|
||||
"""
|
||||
|
||||
conversion_options = {
|
||||
'comment' : description
|
||||
@ -30,21 +35,42 @@ class Pagina12(BasicNewsRecipe):
|
||||
, 'language' : language
|
||||
}
|
||||
|
||||
keep_only_tags = [dict(name='div',attrs={'class':'article clearfix'})]
|
||||
|
||||
keep_only_tags = [dict(attrs={'class':'uitstekendekeus'})]
|
||||
remove_tags = [
|
||||
dict(name=['meta','base','link','object','embed'])
|
||||
,dict(attrs={'class':['reclamespace','tags-and-sharing']})
|
||||
]
|
||||
remove_attributes=['lang']
|
||||
|
||||
feeds = [
|
||||
(u'Voorpagina' , u'http://feeds.feedburner.com/NRCHandelsbladVoorpagina' )
|
||||
,(u'Binnenland' , u'http://feeds.feedburner.com/NRCHandelsbladBinnenland' )
|
||||
,(u'Buitenland' , u'http://feeds.feedburner.com/NRCHandelsbladBuitenland' )
|
||||
,(u'Economie' , u'http://feeds.feedburner.com/NRCHandelsbladEconomie' )
|
||||
,(u'Kunst & Film' , u'http://feeds.feedburner.com/nrc/NRCHandelsbladKunstEnFilm')
|
||||
,(u'Sport' , u'http://feeds.feedburner.com/NRCHandelsbladSport' )
|
||||
,(u'Wetenschap ' , u'http://www.nrc.nl/rss/wetenschap' )
|
||||
(u'Voor nieuws', u'http://www.nrc.nl/nieuws/categorie/nieuws/rss.php' )
|
||||
,(u'Binnenland' , u'http://www.nrc.nl/nieuws/categorie/binnenland/rss.php' )
|
||||
,(u'Buitenland' , u'http://www.nrc.nl/nieuws/categorie/buitenland/rss.php' )
|
||||
,(u'Economie' , u'http://www.nrc.nl/nieuws/categorie/economie/rss.php' )
|
||||
,(u'Cultuur' , u'http://www.nrc.nl/nieuws/categorie/cultuur/rss.php' )
|
||||
,(u'Sport' , u'http://www.nrc.nl/nieuws/categorie/sport/rss.php' )
|
||||
,(u'Wetenschap ', u'http://www.nrc.nl/nieuws/categorie/wetenschap-nieuws/rss.php')
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
return url + '?service=Print'
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
return self.adeify_images(soup)
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
for item in soup.findAll('a'):
|
||||
limg = item.find('img')
|
||||
if item.string is not None:
|
||||
str = item.string
|
||||
item.replaceWith(str)
|
||||
else:
|
||||
if limg:
|
||||
item.name = 'div'
|
||||
atritems =['href','target','rel']
|
||||
for atit in atritems:
|
||||
if item.has_key(atit):
|
||||
del item[atit]
|
||||
else:
|
||||
str = self.tag_to_string(item)
|
||||
item.replaceWith(str)
|
||||
for item in soup.findAll('img'):
|
||||
if not item.has_key('alt'):
|
||||
item['alt'] = 'image'
|
||||
return soup
|
||||
|
@ -586,7 +586,7 @@ class NYTimes(BasicNewsRecipe):
|
||||
return self.strip_anchors(soup)
|
||||
|
||||
def postprocess_html(self,soup, True):
|
||||
|
||||
try:
|
||||
if self.one_picture_per_article:
|
||||
# Remove all images after first
|
||||
largeImg = soup.find(True, {'class':'articleSpanImage'})
|
||||
@ -621,10 +621,13 @@ class NYTimes(BasicNewsRecipe):
|
||||
cgFirst.insert(insertLoc,firstImg)
|
||||
else:
|
||||
self.log(">>> No class:'columnGroup first' found <<<")
|
||||
except:
|
||||
self.log("ERROR: One picture per article in postprocess_html")
|
||||
|
||||
try:
|
||||
# Change captions to italic
|
||||
for caption in soup.findAll(True, {'class':'caption'}) :
|
||||
if caption and caption.contents[0]:
|
||||
if caption and len(caption) > 0:
|
||||
cTag = Tag(soup, "p", [("class", "caption")])
|
||||
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
|
||||
mp_off = c.find("More Photos")
|
||||
@ -632,7 +635,10 @@ class NYTimes(BasicNewsRecipe):
|
||||
c = c[:mp_off]
|
||||
cTag.insert(0, c)
|
||||
caption.replaceWith(cTag)
|
||||
except:
|
||||
self.log("ERROR: Problem in change captions to italic")
|
||||
|
||||
try:
|
||||
# Change <nyt_headline> to <h2>
|
||||
h1 = soup.find('h1')
|
||||
if h1:
|
||||
@ -653,7 +659,10 @@ class NYTimes(BasicNewsRecipe):
|
||||
hrs = soup.findAll('hr')
|
||||
for hr in hrs:
|
||||
hr.extract()
|
||||
except:
|
||||
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
|
||||
|
||||
try:
|
||||
# Change <h1> to <h3> - used in editorial blogs
|
||||
masthead = soup.find("h1")
|
||||
if masthead:
|
||||
@ -663,18 +672,27 @@ class NYTimes(BasicNewsRecipe):
|
||||
tag = Tag(soup, "h3")
|
||||
tag.insert(0, self.fixChars(masthead.contents[0]))
|
||||
masthead.replaceWith(tag)
|
||||
except:
|
||||
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
|
||||
|
||||
try:
|
||||
# Change <span class="bold"> to <b>
|
||||
for subhead in soup.findAll(True, {'class':'bold'}) :
|
||||
if subhead.contents:
|
||||
bTag = Tag(soup, "b")
|
||||
bTag.insert(0, subhead.contents[0])
|
||||
subhead.replaceWith(bTag)
|
||||
except:
|
||||
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
|
||||
|
||||
try:
|
||||
divTag = soup.find('div',attrs={'id':'articleBody'})
|
||||
if divTag:
|
||||
divTag['class'] = divTag['id']
|
||||
except:
|
||||
self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")
|
||||
|
||||
try:
|
||||
# Add class="authorId" to <div> so we can format with CSS
|
||||
divTag = soup.find('div',attrs={'id':'authorId'})
|
||||
if divTag and divTag.contents[0]:
|
||||
@ -683,5 +701,32 @@ class NYTimes(BasicNewsRecipe):
|
||||
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
|
||||
use_alt=False)))
|
||||
divTag.replaceWith(tag)
|
||||
except:
|
||||
self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
|
||||
|
||||
return soup
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
shortparagraph = ""
|
||||
try:
|
||||
if len(article.text_summary.strip()) == 0:
|
||||
articlebodies = soup.findAll('div',attrs={'class':'articleBody'})
|
||||
if articlebodies:
|
||||
for articlebody in articlebodies:
|
||||
if articlebody:
|
||||
paras = articlebody.findAll('p')
|
||||
for p in paras:
|
||||
refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
|
||||
#account for blank paragraphs and short paragraphs by appending them to longer ones
|
||||
if len(refparagraph) > 0:
|
||||
if len(refparagraph) > 70: #approximately one line of text
|
||||
article.summary = article.text_summary = shortparagraph + refparagraph
|
||||
return
|
||||
else:
|
||||
shortparagraph = refparagraph + " "
|
||||
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
|
||||
shortparagraph = shortparagraph + "- "
|
||||
except:
|
||||
self.log("Error creating article descriptions")
|
||||
return
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
@ -23,6 +24,10 @@ class NYTimes(BasicNewsRecipe):
|
||||
webEdition = False
|
||||
oldest_article = 7
|
||||
|
||||
# replace paid Kindle Version: the name will be changed to "The New York Times" to cause
|
||||
# previous paid versions of the new york times to best sent to the back issues folder on the kindle
|
||||
replaceKindleVersion = False
|
||||
|
||||
# includeSections: List of sections to include. If empty, all sections found will be included.
|
||||
# Otherwise, only the sections named will be included. For example,
|
||||
#
|
||||
@ -94,6 +99,10 @@ class NYTimes(BasicNewsRecipe):
|
||||
title='New York Times (Web)'
|
||||
description = 'New York Times on the Web'
|
||||
needs_subscription = True
|
||||
elif replaceKindleVersion:
|
||||
title='The New York Times'
|
||||
description = 'Today\'s New York Times'
|
||||
needs_subscription = True
|
||||
else:
|
||||
title='New York Times'
|
||||
description = 'Today\'s New York Times'
|
||||
@ -150,6 +159,11 @@ class NYTimes(BasicNewsRecipe):
|
||||
'relatedSearchesModule',
|
||||
'side_tool',
|
||||
'singleAd',
|
||||
'entry entry-utility', #added for DealBook
|
||||
'entry-tags', #added for DealBook
|
||||
'footer promos clearfix', #added for DealBook
|
||||
'footer links clearfix', #added for DealBook
|
||||
'inlineImage module', #added for DealBook
|
||||
re.compile('^subNavigation'),
|
||||
re.compile('^leaderboard'),
|
||||
re.compile('^module'),
|
||||
@ -183,6 +197,9 @@ class NYTimes(BasicNewsRecipe):
|
||||
'side_index',
|
||||
'side_tool',
|
||||
'toolsRight',
|
||||
'skybox', #added for DealBook
|
||||
'TopAd', #added for DealBook
|
||||
'related-content', #added for DealBook
|
||||
]),
|
||||
dict(name=['script', 'noscript', 'style','form','hr'])]
|
||||
no_stylesheets = True
|
||||
@ -237,7 +254,7 @@ class NYTimes(BasicNewsRecipe):
|
||||
def exclude_url(self,url):
|
||||
if not url.startswith("http"):
|
||||
return True
|
||||
if not url.endswith(".html"):
|
||||
if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
|
||||
return True
|
||||
if 'nytimes.com' not in url:
|
||||
return True
|
||||
@ -560,7 +577,6 @@ class NYTimes(BasicNewsRecipe):
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
|
||||
if self.webEdition & (self.oldest_article>0):
|
||||
date_tag = soup.find(True,attrs={'class': ['dateline','date']})
|
||||
if date_tag:
|
||||
@ -583,10 +599,13 @@ class NYTimes(BasicNewsRecipe):
|
||||
img_div = soup.find('div','inlineImage module')
|
||||
if img_div:
|
||||
img_div.extract()
|
||||
|
||||
|
||||
return self.strip_anchors(soup)
|
||||
|
||||
def postprocess_html(self,soup, True):
|
||||
|
||||
try:
|
||||
if self.one_picture_per_article:
|
||||
# Remove all images after first
|
||||
largeImg = soup.find(True, {'class':'articleSpanImage'})
|
||||
@ -621,10 +640,13 @@ class NYTimes(BasicNewsRecipe):
|
||||
cgFirst.insert(insertLoc,firstImg)
|
||||
else:
|
||||
self.log(">>> No class:'columnGroup first' found <<<")
|
||||
except:
|
||||
self.log("ERROR: One picture per article in postprocess_html")
|
||||
|
||||
try:
|
||||
# Change captions to italic
|
||||
for caption in soup.findAll(True, {'class':'caption'}) :
|
||||
if caption and caption.contents[0]:
|
||||
if caption and len(caption) > 0:
|
||||
cTag = Tag(soup, "p", [("class", "caption")])
|
||||
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
|
||||
mp_off = c.find("More Photos")
|
||||
@ -632,9 +654,13 @@ class NYTimes(BasicNewsRecipe):
|
||||
c = c[:mp_off]
|
||||
cTag.insert(0, c)
|
||||
caption.replaceWith(cTag)
|
||||
except:
|
||||
self.log("ERROR: Problem in change captions to italic")
|
||||
|
||||
try:
|
||||
# Change <nyt_headline> to <h2>
|
||||
h1 = soup.find('h1')
|
||||
blogheadline = str(h1) #added for dealbook
|
||||
if h1:
|
||||
headline = h1.find("nyt_headline")
|
||||
if headline:
|
||||
@ -642,18 +668,50 @@ class NYTimes(BasicNewsRecipe):
|
||||
tag['class'] = "headline"
|
||||
tag.insert(0, self.fixChars(headline.contents[0]))
|
||||
h1.replaceWith(tag)
|
||||
elif blogheadline.find('entry-title'):#added for dealbook
|
||||
tag = Tag(soup, "h2")#added for dealbook
|
||||
tag['class'] = "headline"#added for dealbook
|
||||
tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
|
||||
h1.replaceWith(tag)#added for dealbook
|
||||
|
||||
else:
|
||||
# Blog entry - replace headline, remove <hr> tags
|
||||
# Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
|
||||
headline = soup.find('title')
|
||||
if headline:
|
||||
tag = Tag(soup, "h2")
|
||||
tag['class'] = "headline"
|
||||
tag.insert(0, self.fixChars(headline.contents[0]))
|
||||
tag.insert(0, self.fixChars(headline.renderContents()))
|
||||
soup.insert(0, tag)
|
||||
hrs = soup.findAll('hr')
|
||||
for hr in hrs:
|
||||
hr.extract()
|
||||
except:
|
||||
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
|
||||
|
||||
try:
|
||||
#if this is from a blog (dealbook, fix the byline format
|
||||
bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
|
||||
if bylineauthor:
|
||||
tag = Tag(soup, "h6")
|
||||
tag['class'] = "byline"
|
||||
tag.insert(0, self.fixChars(bylineauthor.renderContents()))
|
||||
bylineauthor.replaceWith(tag)
|
||||
except:
|
||||
self.log("ERROR: fixing byline author format")
|
||||
|
||||
try:
|
||||
#if this is a blog (dealbook) fix the credit style for the pictures
|
||||
blogcredit = soup.find('div',attrs={'class':'credit'})
|
||||
if blogcredit:
|
||||
tag = Tag(soup, "h6")
|
||||
tag['class'] = "credit"
|
||||
tag.insert(0, self.fixChars(blogcredit.renderContents()))
|
||||
blogcredit.replaceWith(tag)
|
||||
except:
|
||||
self.log("ERROR: fixing credit format")
|
||||
|
||||
|
||||
try:
|
||||
# Change <h1> to <h3> - used in editorial blogs
|
||||
masthead = soup.find("h1")
|
||||
if masthead:
|
||||
@ -663,18 +721,34 @@ class NYTimes(BasicNewsRecipe):
|
||||
tag = Tag(soup, "h3")
|
||||
tag.insert(0, self.fixChars(masthead.contents[0]))
|
||||
masthead.replaceWith(tag)
|
||||
except:
|
||||
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
|
||||
|
||||
try:
|
||||
# Change <span class="bold"> to <b>
|
||||
for subhead in soup.findAll(True, {'class':'bold'}) :
|
||||
if subhead.contents:
|
||||
bTag = Tag(soup, "b")
|
||||
bTag.insert(0, subhead.contents[0])
|
||||
subhead.replaceWith(bTag)
|
||||
except:
|
||||
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
|
||||
try:
|
||||
#remove the <strong> update tag
|
||||
blogupdated = soup.find('span', {'class':'update'})
|
||||
if blogupdated:
|
||||
blogupdated.replaceWith("")
|
||||
except:
|
||||
self.log("ERROR: Removing strong tag")
|
||||
|
||||
try:
|
||||
divTag = soup.find('div',attrs={'id':'articleBody'})
|
||||
if divTag:
|
||||
divTag['class'] = divTag['id']
|
||||
except:
|
||||
self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")
|
||||
|
||||
try:
|
||||
# Add class="authorId" to <div> so we can format with CSS
|
||||
divTag = soup.find('div',attrs={'id':'authorId'})
|
||||
if divTag and divTag.contents[0]:
|
||||
@ -683,6 +757,31 @@ class NYTimes(BasicNewsRecipe):
|
||||
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
|
||||
use_alt=False)))
|
||||
divTag.replaceWith(tag)
|
||||
except:
|
||||
self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
|
||||
|
||||
return soup
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
shortparagraph = ""
|
||||
try:
|
||||
if len(article.text_summary.strip()) == 0:
|
||||
articlebodies = soup.findAll('div',attrs={'class':'articleBody'})
|
||||
if articlebodies:
|
||||
for articlebody in articlebodies:
|
||||
if articlebody:
|
||||
paras = articlebody.findAll('p')
|
||||
for p in paras:
|
||||
refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
|
||||
#account for blank paragraphs and short paragraphs by appending them to longer ones
|
||||
if len(refparagraph) > 0:
|
||||
if len(refparagraph) > 70: #approximately one line of text
|
||||
article.summary = article.text_summary = shortparagraph + refparagraph
|
||||
return
|
||||
else:
|
||||
shortparagraph = refparagraph + " "
|
||||
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
|
||||
shortparagraph = shortparagraph + "- "
|
||||
except:
|
||||
self.log("Error creating article descriptions")
|
||||
return
|
||||
|
||||
|
61
resources/recipes/pressthink.recipe
Normal file
61
resources/recipes/pressthink.recipe
Normal file
@ -0,0 +1,61 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
pressthink.org
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class PressThink(BasicNewsRecipe):
|
||||
title = 'PressThink'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'Ghost of democracy in the media machine'
|
||||
oldest_article = 60
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
encoding = 'utf8'
|
||||
publisher = 'Arthur L. Carter Journalism Institute'
|
||||
category = 'news, USA, world, economy, politics, media'
|
||||
language = 'en'
|
||||
publication_type = 'blog'
|
||||
extra_css = """
|
||||
body{ font-family: Helvetica,Arial,sans-serif }
|
||||
img{display: block; margin-bottom: 0.5em}
|
||||
h6{font-size: 1.1em; font-weight: bold}
|
||||
.post-author{font-family: Georgia,serif}
|
||||
.post-title{color: #AB0000}
|
||||
.says{color: gray}
|
||||
.comment {
|
||||
border-bottom: 1px dotted #555555;
|
||||
border-top: 1px dotted #DDDDDD;
|
||||
margin-left: 10px;
|
||||
min-height: 100px;
|
||||
padding: 15px 0 20px;
|
||||
}
|
||||
"""
|
||||
|
||||
conversion_options = {
|
||||
'comments' : description
|
||||
,'tags' : category
|
||||
,'language' : language
|
||||
,'publisher': publisher
|
||||
}
|
||||
|
||||
remove_tags = [dict(name=['form','iframe','embed','object','link','base','table','meta'])]
|
||||
keep_only_tags = [dict(attrs={'class':['post-title','post-author','entry','postmetadata alt','commentlist']})]
|
||||
|
||||
feeds = [(u'Articles', u'http://pressthink.org/feed/')]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
for item in soup.findAll('img', alt=False):
|
||||
item['alt'] = 'image'
|
||||
for alink in soup.findAll('a'):
|
||||
if alink.string is not None:
|
||||
tstr = alink.string
|
||||
alink.replaceWith(tstr)
|
||||
return soup
|
||||
|
||||
|
@ -21,17 +21,54 @@ class SeattleTimes(BasicNewsRecipe):
|
||||
encoding = 'cp1252'
|
||||
language = 'en'
|
||||
|
||||
|
||||
html2lrf_options = [
|
||||
'--comment' , description
|
||||
, '--category' , category
|
||||
, '--publisher', publisher
|
||||
feeds = [
|
||||
(u'Top Stories',
|
||||
u'http://seattletimes.nwsource.com/rss/home.xml'),
|
||||
#(u'Articles', u'http://seattletimes.nwsource.com/rss/seattletimes.xml')
|
||||
(u'Business & Technology',
|
||||
u'http://seattletimes.nwsource.com/rss/businesstechnology.xml'),
|
||||
(u'Personal Technology',
|
||||
u'http://seattletimes.nwsource.com/rss/personaltechnology.xml'),
|
||||
(u'Entertainment & the Arts',
|
||||
u'http://seattletimes.nwsource.com/rss/artsentertainment.xml'),
|
||||
(u'Health',
|
||||
u'http://seattletimes.nwsource.com/rss/health.xml'),
|
||||
(u'Living',
|
||||
u'http://seattletimes.nwsource.com/rss/living.xml'),
|
||||
(u'Local News',
|
||||
u'http://seattletimes.nwsource.com/rss/localnews.xml'),
|
||||
(u'Nation & World',
|
||||
u'http://seattletimes.nwsource.com/rss/nationworld.xml'),
|
||||
(u'Opinion',
|
||||
u'http://seattletimes.nwsource.com/rss/opinion.xml'),
|
||||
(u'Politics',
|
||||
u'http://seattletimes.nwsource.com/rss/politics.xml'),
|
||||
(u'Sports',
|
||||
u'http://seattletimes.nwsource.com/rss/sports.xml'),
|
||||
(u'Nicole Brodeur',
|
||||
u'http://seattletimes.nwsource.com/rss/nicolebrodeur.xml'),
|
||||
(u'Danny Westneat',
|
||||
u'http://seattletimes.nwsource.com/rss/dannywestneat.xml'),
|
||||
(u'Jerry Large',
|
||||
u'http://seattletimes.nwsource.com/rss/jerrylarge.xml'),
|
||||
(u'Ron Judd',
|
||||
u'http://seattletimes.nwsource.com/rss/ronjudd.xml'),
|
||||
(u'Education',
|
||||
u'http://seattletimes.nwsource.com/rss/education.xml'),
|
||||
(u'Letters to the Editor',
|
||||
u'http://seattletimes.nwsource.com/rss/northwestvoices.xml'),
|
||||
(u'Travel',
|
||||
u'http://seattletimes.nwsource.com/rss/travel.xml'),
|
||||
(u'Outdoors',
|
||||
u'http://seattletimes.nwsource.com/rss/outdoors.xml'),
|
||||
(u'Steve Kelley',
|
||||
u'http://seattletimes.nwsource.com/rss/stevekelley.xml'),
|
||||
(u'Jerry Brewer',
|
||||
u'http://seattletimes.nwsource.com/rss/jerrybrewer.xml'),
|
||||
(u'Most Read Articles',
|
||||
u'http://seattletimes.nwsource.com/rss/mostreadarticles.xml'),
|
||||
]
|
||||
|
||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
|
||||
|
||||
feeds = [(u'Articles', u'http://seattletimes.nwsource.com/rss/seattletimes.xml')]
|
||||
|
||||
remove_tags = [
|
||||
dict(name=['object','link','script'])
|
||||
,dict(name='p', attrs={'class':'permission'})
|
||||
|
@@ -1,5 +1,5 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
#from calibre.ebooks.BeautifulSoup import BeautifulSoup
from urllib import quote

class SportsIllustratedRecipe(BasicNewsRecipe) :
@@ -91,7 +91,7 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
        # expire : no idea what value to use
        # All this comes from the Javascript function that redirects to the print version. It's called PT() and is defined in the file 48.js

    def preprocess_html(self, soup):
    '''def preprocess_html(self, soup):
        header = soup.find('div', attrs = {'class' : 'siv_artheader'})
        homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
        body = homeMadeSoup.body
@@ -115,4 +115,5 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
            body.append(para)

        return homeMadeSoup
    '''
@@ -35,7 +35,6 @@ class TechnologyReview(BasicNewsRecipe):
    def get_article_url(self, article):
        return article.get('guid', article.get('id', None))

    def print_version(self, url):
        baseurl='http://www.technologyreview.com/printer_friendly_article.aspx?id='
        split1 = string.split(url,"/")
@@ -43,3 +42,25 @@ class TechnologyReview(BasicNewsRecipe):
        split2= string.split(xxx,"/")
        s = baseurl + split2[0]
        return s

    def postprocess_html(self, soup, first_fetch):
        #remove picture
        headerhtml = soup.find(True, {'class':'header'})
        headerhtml.replaceWith("")

        #remove close button
        closehtml = soup.find(True, {'class':'close'})
        closehtml.replaceWith("")

        #remove banner advertisement
        bannerhtml = soup.find(True, {'class':'bannerad'})
        bannerhtml.replaceWith("")

        #thanks kiklop74! This code removes all links from the text
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)

        return soup
 25  resources/recipes/tri_city_herald.recipe  Normal file
@@ -0,0 +1,25 @@
from calibre.web.feeds.news import BasicNewsRecipe

class TriCityHeraldRecipe(BasicNewsRecipe):
    title = u'Tri-City Herald'
    description = 'The Tri-City Herald Mid-Columbia.'
    language = 'en'
    __author__ = 'Laura Gjovaag'
    oldest_article = 1.5
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True
    keep_only_tags = [
        dict(name='div', attrs={'id':'story_header'}),
        dict(name='img', attrs={'class':'imageCycle'}),
        dict(name='div', attrs={'id':['cycleImageCaption', 'story_body']})
        ]
    remove_tags = [
        dict(name='div', attrs={'id':'story_mlt'}),
        dict(name='a', attrs={'id':'commentCount'}),
        dict(name=['script', 'noscript', 'style'])]
    extra_css = 'h1{font: bold 140%;} #cycleImageCaption{font: monospace 60%}'

    feeds = [
        (u'Tri-City Herald Mid-Columbia', u'http://www.tri-cityherald.com/901/index.rss')
        ]
80
resources/recipes/tyzden.recipe
Normal file
80
resources/recipes/tyzden.recipe
Normal file
@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Miroslav Vasko zemiak@gmail.com'
|
||||
|
||||
'''
|
||||
.tyzden, a weekly news magazine (a week old issue)
|
||||
'''
|
||||
from calibre import strftime
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from datetime import date
|
||||
import re
|
||||
|
||||
class TyzdenRecipe(BasicNewsRecipe):
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'zemiak'
|
||||
language = 'sk'
|
||||
version = 1
|
||||
|
||||
publisher = u'www.tyzden.sk'
|
||||
category = u'Magazine'
|
||||
description = u'A conservative weekly magazine. The latest free issue'
|
||||
|
||||
today = date.today()
|
||||
iso = today.isocalendar()
|
||||
year = iso[0]
|
||||
weeknum = iso[1]
|
||||
|
||||
if (weeknum > 1):
|
||||
weeknum -= 1
|
||||
|
||||
title = u'tyzden'
|
||||
|
||||
base_url_path = 'http://www.tyzden.sk/casopis/' + str(year) + '/' + str(weeknum)
|
||||
base_url = base_url_path + '.html'
|
||||
|
||||
oldest_article = 20
|
||||
max_articles_per_feed = 100
|
||||
remove_javascript = True
|
||||
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
|
||||
keep_only_tags = []
|
||||
keep_only_tags.append(dict(name = 'h1'))
|
||||
keep_only_tags.append(dict(name = 'div', attrs = {'class': 'text_area top_nofoto'}))
|
||||
keep_only_tags.append(dict(name = 'div', attrs = {'class': 'text_block'}))
|
||||
|
||||
remove_tags_after = [dict(name = 'div', attrs = {'class': 'text_block'})]
|
||||
|
||||
def find_sections(self):
|
||||
soup = self.index_to_soup(self.base_url)
|
||||
# find cover pic
|
||||
imgdiv = soup.find('div', attrs = {'class': 'foto'})
|
||||
if imgdiv is not None:
|
||||
img = imgdiv.find('img')
|
||||
if img is not None:
|
||||
self.cover_url = 'http://www.tyzden.sk/' + img['src']
|
||||
# end find cover pic
|
||||
|
||||
for s in soup.findAll('a', attrs={'href': re.compile(r'rubrika/.*')}):
|
||||
yield (self.tag_to_string(s), s)
|
||||
|
||||
def find_articles(self, soup):
|
||||
for art in soup.findAllNext('a'):
|
||||
if (not art['href'].startswith('casopis/')):
|
||||
break;
|
||||
|
||||
url = art['href']
|
||||
title = self.tag_to_string(art)
|
||||
yield {
|
||||
'title': title, 'url':self.base_url_path + '/' + url, 'description':title,
|
||||
'date' : strftime('%a, %d %b'),
|
||||
}
|
||||
|
||||
def parse_index(self):
|
||||
feeds = []
|
||||
for title, soup in self.find_sections():
|
||||
feeds.append((title, list(self.find_articles(soup))))
|
||||
|
||||
return feeds
|
 29  resources/recipes/wichita_eagle.recipe  Normal file
@@ -0,0 +1,29 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1294938721(BasicNewsRecipe):
    title = u'Wichita Eagle'
    language = 'en'
    __author__ = 'Jason Cameron'
    description = 'Daily news from the Wichita Eagle'
    oldest_article = 1
    max_articles_per_feed = 30
    keep_only_tags = [dict(name='div', attrs={'id':'wide'})]
    feeds = [
             (u'Local News',
              u'http://www.kansas.com/news/local/index.rss'),
             (u'National News',
              u'http://www.kansas.com/news/nation-world/index.rss'),
             (u'Sports',
              u'http://www.kansas.com/sports/index.rss'),
             (u'Opinion',
              u'http://www.kansas.com/opinion/index.rss'),
             (u'Life',
              u'http://www.kansas.com/living/index.rss'),
             (u'Entertainment',
              u'http://www.kansas.com/entertainment/index.rss')
            ]

    def print_version(self, url):
        urlparts = url.split('/')
        newadd = urlparts[5]+'/v-print'
        return url.replace(url, newadd.join(url.split(urlparts[5])))
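The print_version above splices '/v-print' in after the path segment at index 5 of the article URL. A short sketch of the string manipulation; the URL is hypothetical and only chosen so that the spliced segment occurs once in the string:

# Illustrative only: hypothetical article URL, not taken from the feed.
url = 'http://www.kansas.com/news/local/story/1688886.html'
urlparts = url.split('/')            # urlparts[5] == 'story'
newadd = urlparts[5] + '/v-print'    # 'story/v-print'
print newadd.join(url.split(urlparts[5]))
# -> http://www.kansas.com/news/local/story/v-print/1688886.html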
@@ -2,8 +2,10 @@
__license__ = 'GPL v3'
__docformat__ = 'restructuredtext en'

import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.chardet import xml_to_unicode

class Wired_Daily(BasicNewsRecipe):

@@ -15,30 +17,43 @@ class Wired_Daily(BasicNewsRecipe):

    no_stylesheets = True

    preprocess_regexps = [(re.compile(r'<head.*</head>', re.DOTALL), lambda m:
        '<head></head>')]

    remove_tags_before = dict(name='div', id='content')
    remove_tags = [dict(id=['social_tools', 'outerWrapper', 'sidebar',
        'footer', 'advertisement', 'blog_subscription_unit',
        'brightcove_component']),
        {'class':'entryActions'},
        dict(name=['noscript', 'script'])]
    remove_tags = [dict(id=['header', 'commenting_module', 'post_nav',
        'social_tools', 'sidebar', 'footer', 'social_wishlist', 'pgwidget',
        'outerWrapper', 'inf_widget']),
        {'class':['entryActions', 'advertisement', 'entryTags']},
        dict(name=['noscript', 'script']),
        dict(name='h4', attrs={'class':re.compile(r'rat\d+')}),
        {'class':lambda x: x and x.startswith('contentjump')},
        dict(name='li', attrs={'class':['entryCategories', 'entryEdit']})]


    feeds = [
        ('Top News', 'http://feeds.wired.com/wired/index'),
        ('Culture', 'http://feeds.wired.com/wired/culture'),
        ('Software', 'http://feeds.wired.com/wired/software'),
        ('Mac', 'http://feeds.feedburner.com/cultofmac/bFow'),
        ('Gadgets', 'http://feeds.wired.com/wired/gadgets'),
        ('Cars', 'http://feeds.wired.com/wired/cars'),
        ('Entertainment', 'http://feeds.wired.com/wired/entertainment'),
        ('Gaming', 'http://feeds.wired.com/wired/gaming'),
        ('Science', 'http://feeds.wired.com/wired/science'),
        ('Med Tech', 'http://feeds.wired.com/wired/medtech'),
        ('Politics', 'http://feeds.wired.com/wired/politics'),
        ('Tech Biz', 'http://feeds.wired.com/wired/techbiz'),
        ('Commentary', 'http://feeds.wired.com/wired/commentary'),
        ('Product Reviews',
            'http://www.wired.com/reviews/feeds/latestProductsRss'),
        ('Autopia', 'http://www.wired.com/autopia/feed/'),
        ('Danger Room', 'http://www.wired.com/dangerroom/feed/'),
        ('Epicenter', 'http://www.wired.com/epicenter/feed/'),
        ('Gadget Lab', 'http://www.wired.com/gadgetlab/feed/'),
        ('Geek Dad', 'http://www.wired.com/geekdad/feed/'),
        ('Playbook', 'http://www.wired.com/playbook/feed/'),
        ('Rawfile', 'http://www.wired.com/rawfile/feed/'),
        ('This Day in Tech', 'http://www.wired.com/thisdayintech/feed/'),
        ('Threat Level', 'http://www.wired.com/threatlevel/feed/'),
        ('Underwire', 'http://www.wired.com/underwire/feed/'),
        ('Web Monkey', 'http://www.webmonkey.com/feed/'),
        ('Science', 'http://www.wired.com/wiredscience/feed/'),
    ]

    def populate_article_metadata(self, article, soup, first):
        if article.text_summary:
            article.text_summary = xml_to_unicode(article.text_summary,
                    resolve_entities=True)[0]

    def print_version(self, url):
        return url.replace('http://www.wired.com/', 'http://www.wired.com/print/')
        return url + '/all/1'
resources/recipes/yakima_herald.recipe (new file, 21 lines)
@@ -0,0 +1,21 @@
from calibre.web.feeds.news import BasicNewsRecipe

class YakimaHeraldRepublicRecipe(BasicNewsRecipe):
    title = u'Yakima Herald-Republic'
    description = 'The Yakima Herald-Republic.'
    language = 'en'
    __author__ = 'Laura Gjovaag'
    oldest_article = 1.5
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True
    keep_only_tags = [
        dict(name='div', attrs={'id':['searchleft', 'headline_credit']}),
        dict(name='div', attrs={'class':['photo', 'cauthor', 'photocredit']}),
        dict(name='div', attrs={'id':['content_body', 'footerleft']})
        ]
    extra_css = '.cauthor {font: monospace 60%;} .photocredit {font: monospace 60%}'

    feeds = [
        (u'Yakima Herald Online', u'http://feeds.feedburner.com/yhronlinenews'),
        ]
resources/recipes/zerohedge.recipe (new file, 33 lines)
@@ -0,0 +1,33 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
www.zerohedge.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe

class ZeroHedge(BasicNewsRecipe):
    title = 'Zero Hedge'
    __author__ = 'Darko Miletic'
    description = 'On a long enough timeline the survival rate for everyone drops to zero'
    oldest_article = 10
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = True
    encoding = 'utf8'
    publisher = 'zero hedge'
    category = 'news, USA, world, economy, politics'
    language = 'en'
    masthead_url = 'http://www.zerohedge.com/themes/newsflash/logo.png'
    publication_type = 'blog'
    extra_css = 'body{ font-family: sans-serif }'

    conversion_options = {
                          'comments' : description
                         ,'tags' : category
                         ,'language' : language
                         ,'publisher': publisher
                         }


    feeds = [(u'Articles', u'http://feeds.feedburner.com/zerohedge/feed')]
resources/template-functions.json (new file, 28 lines)
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"contains": "def evaluate(self, formatter, kwargs, mi, locals,\n val, test, value_if_present, value_if_not):\n if re.search(test, val):\n return value_if_present\n else:\n return value_if_not\n",
|
||||
"divide": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x / y)\n",
|
||||
"uppercase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return val.upper()\n",
|
||||
"strcat": "def evaluate(self, formatter, kwargs, mi, locals, *args):\n i = 0\n res = ''\n for i in range(0, len(args)):\n res += args[i]\n return res\n",
|
||||
"substr": "def evaluate(self, formatter, kwargs, mi, locals, str_, start_, end_):\n return str_[int(start_): len(str_) if int(end_) == 0 else int(end_)]\n",
|
||||
"ifempty": "def evaluate(self, formatter, kwargs, mi, locals, val, value_if_empty):\n if val:\n return val\n else:\n return value_if_empty\n",
|
||||
"field": "def evaluate(self, formatter, kwargs, mi, locals, name):\n return formatter.get_value(name, [], kwargs)\n",
|
||||
"capitalize": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return capitalize(val)\n",
|
||||
"list_item": "def evaluate(self, formatter, kwargs, mi, locals, val, index, sep):\n if not val:\n return ''\n index = int(index)\n val = val.split(sep)\n try:\n return val[index]\n except:\n return ''\n",
|
||||
"shorten": "def evaluate(self, formatter, kwargs, mi, locals,\n val, leading, center_string, trailing):\n l = max(0, int(leading))\n t = max(0, int(trailing))\n if len(val) > l + len(center_string) + t:\n return val[0:l] + center_string + ('' if t == 0 else val[-t:])\n else:\n return val\n",
|
||||
"re": "def evaluate(self, formatter, kwargs, mi, locals, val, pattern, replacement):\n return re.sub(pattern, replacement, val)\n",
|
||||
"add": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x + y)\n",
|
||||
"lookup": "def evaluate(self, formatter, kwargs, mi, locals, val, *args):\n if len(args) == 2: # here for backwards compatibility\n if val:\n return formatter.vformat('{'+args[0].strip()+'}', [], kwargs)\n else:\n return formatter.vformat('{'+args[1].strip()+'}', [], kwargs)\n if (len(args) % 2) != 1:\n raise ValueError(_('lookup requires either 2 or an odd number of arguments'))\n i = 0\n while i < len(args):\n if i + 1 >= len(args):\n return formatter.vformat('{' + args[i].strip() + '}', [], kwargs)\n if re.search(args[i], val):\n return formatter.vformat('{'+args[i+1].strip() + '}', [], kwargs)\n i += 2\n",
|
||||
"template": "def evaluate(self, formatter, kwargs, mi, locals, template):\n template = template.replace('[[', '{').replace(']]', '}')\n return formatter.safe_format(template, kwargs, 'TEMPLATE', mi)\n",
|
||||
"print": "def evaluate(self, formatter, kwargs, mi, locals, *args):\n print args\n return None\n",
|
||||
"titlecase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return titlecase(val)\n",
|
||||
"test": "def evaluate(self, formatter, kwargs, mi, locals, val, value_if_set, value_not_set):\n if val:\n return value_if_set\n else:\n return value_not_set\n",
|
||||
"eval": "def evaluate(self, formatter, kwargs, mi, locals, template):\n from formatter import eval_formatter\n template = template.replace('[[', '{').replace(']]', '}')\n return eval_formatter.safe_format(template, locals, 'EVAL', None)\n",
|
||||
"multiply": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x * y)\n",
|
||||
"subtract": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x - y)\n",
|
||||
"count": "def evaluate(self, formatter, kwargs, mi, locals, val, sep):\n return unicode(len(val.split(sep)))\n",
|
||||
"lowercase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return val.lower()\n",
|
||||
"assign": "def evaluate(self, formatter, kwargs, mi, locals, target, value):\n locals[target] = value\n return value\n",
|
||||
"switch": "def evaluate(self, formatter, kwargs, mi, locals, val, *args):\n if (len(args) % 2) != 1:\n raise ValueError(_('switch requires an odd number of arguments'))\n i = 0\n while i < len(args):\n if i + 1 >= len(args):\n return args[i]\n if re.search(args[i], val):\n return args[i+1]\n i += 2\n",
|
||||
"strcmp": "def evaluate(self, formatter, kwargs, mi, locals, x, y, lt, eq, gt):\n v = strcmp(x, y)\n if v < 0:\n return lt\n if v == 0:\n return eq\n return gt\n",
|
||||
"cmp": "def evaluate(self, formatter, kwargs, mi, locals, x, y, lt, eq, gt):\n x = float(x if x else 0)\n y = float(y if y else 0)\n if x < y:\n return lt\n if x == y:\n return eq\n return gt\n"
|
||||
}
|
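The JSON file above serialises the Python source of each built-in template function for display in the new Template Functions preferences panel. As a rough, illustrative sketch of how one of those bodies behaves when evaluated, the "shorten" entry can be restated as a plain function; the title string and the template syntax comment below are examples, not values taken from this commit:

    # Illustrative re-statement of the "shorten" entry above as a plain function.
    def shorten(val, leading, center_string, trailing):
        l = max(0, int(leading))
        t = max(0, int(trailing))
        if len(val) > l + len(center_string) + t:
            return val[0:l] + center_string + ('' if t == 0 else val[-t:])
        else:
            return val

    # A template such as {title:shorten(10,...,5)} would then render the
    # (made-up) title below roughly as 'A Christma...tion)'.
    print shorten('A Christmas Carol (Illustrated Edition)', 10, '...', 5)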
@@ -287,7 +287,7 @@
    <xsl:value-of select="count(preceding::rtf:footnote) + 1"/>
    <xsl:text>]</xsl:text>
</xsl:when>
<xsl:when test="(@superscript = 'true')">
<xsl:when test="(@superscript)">
    <xsl:element name="sup">
        <xsl:element name="span">
            <xsl:attribute name="class">
@@ -297,7 +297,7 @@
        </xsl:element>
    </xsl:element>
</xsl:when>
<xsl:when test="(@underscript = 'true')">
<xsl:when test="(@underscript or @subscript)">
    <xsl:element name="sub">
        <xsl:element name="span">
            <xsl:attribute name="class">
@ -117,7 +117,6 @@ if iswindows:
|
||||
poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
|
||||
r'%s\poppler;%s'%(sw_inc_dir, sw_inc_dir))
|
||||
|
||||
popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[1]+r'\qt4']
|
||||
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', sw_lib_dir)
|
||||
popplerqt4_lib_dirs = poppler_lib_dirs
|
||||
poppler_libs = ['poppler']
|
||||
@ -131,7 +130,6 @@ elif isosx:
|
||||
fc_lib = '/sw/lib'
|
||||
poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
|
||||
'/sw/build/poppler-0.14.5/poppler:/sw/build/poppler-0.14.5')
|
||||
popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
|
||||
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR',
|
||||
'/sw/lib')
|
||||
poppler_libs = ['poppler']
|
||||
@ -150,9 +148,6 @@ else:
|
||||
# Include directories
|
||||
poppler_inc_dirs = pkgconfig_include_dirs('poppler',
|
||||
'POPPLER_INC_DIR', '/usr/include/poppler')
|
||||
popplerqt4_inc_dirs = pkgconfig_include_dirs('poppler-qt4', '', '')
|
||||
if not popplerqt4_inc_dirs:
|
||||
popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
|
||||
png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR',
|
||||
'/usr/include')
|
||||
magick_inc_dirs = pkgconfig_include_dirs('MagickWand', 'MAGICK_INC', '/usr/include/ImageMagick')
|
||||
@ -187,20 +182,17 @@ if not poppler_inc_dirs or not os.path.exists(
|
||||
poppler_error = \
|
||||
('Poppler not found on your system. Various PDF related',
|
||||
' functionality will not work. Use the POPPLER_INC_DIR and',
|
||||
' POPPLER_LIB_DIR environment variables.')
|
||||
|
||||
popplerqt4_error = None
|
||||
if not popplerqt4_inc_dirs or not os.path.exists(
|
||||
os.path.join(popplerqt4_inc_dirs[-1], 'poppler-qt4.h')):
|
||||
popplerqt4_error = \
|
||||
('Poppler Qt4 bindings not found on your system.')
|
||||
' POPPLER_LIB_DIR environment variables. calibre requires '
|
||||
' the poppler XPDF headers. If your distro does not '
|
||||
' include them you will have to re-compile poppler '
|
||||
' by hand with --enable-xpdf-headers')
|
||||
|
||||
magick_error = None
|
||||
if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0],
|
||||
'wand')):
|
||||
magick_error = ('ImageMagick not found on your system. '
|
||||
'Try setting the environment variables MAGICK_INC '
|
||||
'and MAGICK_LIB to help calibre locate the inclue and libbrary '
|
||||
'and MAGICK_LIB to help calibre locate the include and library '
|
||||
'files.')
|
||||
|
||||
podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib)
|
||||
|
@ -43,8 +43,9 @@ class Stage3(Command):
|
||||
|
||||
description = 'Stage 3 of the publish process'
|
||||
sub_commands = ['upload_user_manual', 'upload_demo', 'sdist',
|
||||
'upload_to_google_code', 'tag_release', 'upload_to_server',
|
||||
'upload_to_sourceforge', 'upload_to_mobileread',
|
||||
'upload_to_google_code', 'upload_to_sourceforge',
|
||||
'tag_release', 'upload_to_server',
|
||||
'upload_to_mobileread',
|
||||
]
|
||||
|
||||
class Stage4(Command):
|
||||
|
@ -84,6 +84,23 @@ class Resources(Command):
|
||||
|
||||
cPickle.dump(complete, open(dest, 'wb'), -1)
|
||||
|
||||
self.info('\tCreating template-functions.json')
|
||||
dest = self.j(self.RESOURCES, 'template-functions.json')
|
||||
function_dict = {}
|
||||
import inspect
|
||||
from calibre.utils.formatter_functions import all_builtin_functions
|
||||
for obj in all_builtin_functions:
|
||||
eval_func = inspect.getmembers(obj,
|
||||
lambda x: inspect.ismethod(x) and x.__name__ == 'evaluate')
|
||||
try:
|
||||
lines = [l[4:] for l in inspect.getsourcelines(eval_func[0][1])[0]]
|
||||
except:
|
||||
continue
|
||||
lines = ''.join(lines)
|
||||
function_dict[obj.name] = lines
|
||||
import json
|
||||
json.dump(function_dict, open(dest, 'wb'), indent=4)
|
||||
|
||||
def clean(self):
|
||||
for x in ('scripts', 'recipes', 'ebook-convert-complete'):
|
||||
x = self.j(self.RESOURCES, x+'.pickle')
|
||||
|
@ -6,7 +6,7 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, re, cStringIO, base64, httplib, subprocess, hashlib, shutil
|
||||
import os, re, cStringIO, base64, httplib, subprocess, hashlib, shutil, time
|
||||
from subprocess import check_call
|
||||
from tempfile import NamedTemporaryFile, mkdtemp
|
||||
|
||||
@ -160,7 +160,7 @@ class UploadToGoogleCode(Command):
|
||||
|
||||
return 'multipart/form-data; boundary=%s' % BOUNDARY, CRLF.join(body)
|
||||
|
||||
def upload(self, fname, desc, labels=[]):
|
||||
def upload(self, fname, desc, labels=[], retry=0):
|
||||
form_fields = [('summary', desc)]
|
||||
form_fields.extend([('label', l.strip()) for l in labels])
|
||||
|
||||
@ -183,6 +183,10 @@ class UploadToGoogleCode(Command):
|
||||
|
||||
print 'Failed to upload with code %d and reason: %s'%(resp.status,
|
||||
resp.reason)
|
||||
if retry < 1:
|
||||
print 'Retrying in 5 seconds....'
|
||||
time.sleep(5)
|
||||
return self.upload(fname, desc, labels=labels, retry=retry+1)
|
||||
raise Exception('Failed to upload '+fname)
|
||||
|
||||
|
||||
|
@@ -241,7 +241,7 @@ def get_parsed_proxy(typ='http', debug=True):
    return ans


def browser(honor_time=True, max_time=2, mobile_browser=False):
def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
    '''
    Create a mechanize browser for web scraping. The browser handles cookies,
    refresh requests and ignores robots.txt. Also uses proxy if avaialable.
@@ -253,8 +253,10 @@ def browser(honor_time=True, max_time=2, mobile_browser=False):
    opener = Browser()
    opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
    opener.set_handle_robots(False)
    opener.addheaders = [('User-agent', ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
                          'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13')]
    if user_agent is None:
        user_agent = ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
            'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13'
    opener.addheaders = [('User-agent', user_agent)]
    http_proxy = get_proxies().get('http', None)
    if http_proxy:
        opener.set_proxies({'http':http_proxy})
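The hunk above adds an optional user_agent parameter to browser(), so callers can override the User-agent header instead of patching opener.addheaders afterwards. A minimal usage sketch; the UA string below is only an illustration, not a value from this commit:

    from calibre import browser

    # Default behaviour is unchanged: the stock desktop UA string is used.
    br = browser()

    # Callers that need to present a specific client can now pass their own
    # User-agent header directly.
    br = browser(user_agent='Mozilla/5.0 (compatible; ExampleFetcher/1.0)')
    html = br.open('http://www.example.com').read()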
@@ -459,6 +461,18 @@ def force_unicode(obj, enc=preferred_encoding):
            obj = obj.decode('utf-8')
    return obj

def as_unicode(obj, enc=preferred_encoding):
    if not isbytestring(obj):
        try:
            obj = unicode(obj)
        except:
            try:
                obj = str(obj)
            except:
                obj = repr(obj)
    return force_unicode(obj, enc=enc)



def human_readable(size):
    """ Convert a size in bytes into a human readable form """
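The new as_unicode() is a best-effort coercion helper: anything that is not a bytestring is passed through unicode()/str()/repr() before force_unicode() normalises the encoding. A small illustrative sketch of the intended behaviour; the sample values are made up:

    from calibre import as_unicode

    # Exceptions are a common argument: their message becomes a unicode string
    # even when it contains non-ASCII bytes in the local encoding.
    try:
        open('/no/such/file').read()
    except EnvironmentError as e:
        msg = as_unicode(e)        # always unicode, never raises UnicodeDecodeError

    # Byte strings are decoded with the given encoding, with fallbacks handled
    # by force_unicode().
    label = as_unicode(b'caf\xc3\xa9', enc='utf-8')   # u'café'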
@@ -2,7 +2,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
__appname__ = 'calibre'
__version__ = '0.7.37'
__version__ = '0.7.42'
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"

import re
@@ -705,13 +705,17 @@ class ActionTweakEpub(InterfaceActionBase):
    name = 'Tweak ePub'
    actual_plugin = 'calibre.gui2.actions.tweak_epub:TweakEpubAction'

class ActionNextMatch(InterfaceActionBase):
    name = 'Next Match'
    actual_plugin = 'calibre.gui2.actions.next_match:NextMatchAction'

plugins += [ActionAdd, ActionFetchAnnotations, ActionGenerateCatalog,
        ActionConvert, ActionDelete, ActionEditMetadata, ActionView,
        ActionFetchNews, ActionSaveToDisk, ActionShowBookDetails,
        ActionRestart, ActionOpenFolder, ActionConnectShare,
        ActionSendToDevice, ActionHelp, ActionPreferences, ActionSimilarBooks,
        ActionAddToLibrary, ActionEditCollections, ActionChooseLibrary,
        ActionCopyToLibrary, ActionTweakEpub]
        ActionCopyToLibrary, ActionTweakEpub, ActionNextMatch]

# }}}

@@ -843,6 +847,17 @@ class Plugboard(PreferencesPlugin):
    config_widget = 'calibre.gui2.preferences.plugboard'
    description = _('Change metadata fields before saving/sending')

class TemplateFunctions(PreferencesPlugin):
    name = 'TemplateFunctions'
    icon = I('template_funcs.png')
    gui_name = _('Template Functions')
    category = 'Advanced'
    gui_category = _('Advanced')
    category_order = 5
    name_order = 4
    config_widget = 'calibre.gui2.preferences.template_functions'
    description = _('Create your own template functions')

class Email(PreferencesPlugin):
    name = 'Email'
    icon = I('mail.png')
@@ -904,6 +919,6 @@ class Misc(PreferencesPlugin):

plugins += [LookAndFeel, Behavior, Columns, Toolbar, InputOptions,
        CommonOptions, OutputOptions, Adding, Saving, Sending, Plugboard,
        Email, Server, Plugins, Tweaks, Misc]
        Email, Server, Plugins, Tweaks, Misc, TemplateFunctions]

#}}}
@ -160,18 +160,6 @@ class InputFormatPlugin(Plugin):
|
||||
'''
|
||||
raise NotImplementedError()
|
||||
|
||||
def preprocess_html(self, opts, html):
|
||||
'''
|
||||
This method is called by the conversion pipeline on all HTML before it
|
||||
is parsed. It is meant to be used to do any required preprocessing on
|
||||
the HTML, like removing hard line breaks, etc.
|
||||
|
||||
:param html: A unicode string
|
||||
:return: A unicode string
|
||||
'''
|
||||
return html
|
||||
|
||||
|
||||
def convert(self, stream, options, file_ext, log, accelerators):
|
||||
'''
|
||||
This method must be implemented in sub-classes. It must return
|
||||
|
@ -441,7 +441,7 @@ class TabletOutput(iPadOutput):
|
||||
|
||||
class SamsungGalaxy(TabletOutput):
|
||||
name = 'Samsung Galaxy'
|
||||
shortname = 'galaxy'
|
||||
short_name = 'galaxy'
|
||||
description = _('Intended for the Samsung Galaxy and similar tablet devices with '
|
||||
'a resolution of 600x1280')
|
||||
screen_size = comic_screen_size = (600, 1280)
|
||||
|
@ -21,21 +21,22 @@ class ANDROID(USBMS):
|
||||
# HTC
|
||||
0x0bb4 : { 0x0c02 : [0x100, 0x0227, 0x0226], 0x0c01 : [0x100, 0x0227], 0x0ff9
|
||||
: [0x0100, 0x0227, 0x0226], 0x0c87: [0x0100, 0x0227, 0x0226],
|
||||
0xc92 : [0x100], 0xc97: [0x226]},
|
||||
0xc92 : [0x100], 0xc97: [0x226], 0xc99 : [0x0100]},
|
||||
|
||||
# Eken
|
||||
0x040d : { 0x8510 : [0x0001], 0x0851 : [0x1] },
|
||||
|
||||
# Motorola
|
||||
0x22b8 : { 0x41d9 : [0x216], 0x2d61 : [0x100], 0x2d67 : [0x100],
|
||||
0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216] },
|
||||
0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216],
|
||||
0x4286 : [0x216], 0x42b3 : [0x216] },
|
||||
|
||||
# Sony Ericsson
|
||||
0xfce : { 0xd12e : [0x0100]},
|
||||
|
||||
# Google
|
||||
0x18d1 : { 0x4e11 : [0x0100, 0x226, 0x227], 0x4e12: [0x0100, 0x226,
|
||||
0x227], 0x4e21: [0x0100, 0x226, 0x227]},
|
||||
0x227], 0x4e21: [0x0100, 0x226, 0x227], 0xb058: [0x0222]},
|
||||
|
||||
# Samsung
|
||||
0x04e8 : { 0x681d : [0x0222, 0x0223, 0x0224, 0x0400],
|
||||
@ -52,6 +53,9 @@ class ANDROID(USBMS):
|
||||
# LG
|
||||
0x1004 : { 0x61cc : [0x100] },
|
||||
|
||||
# Archos
|
||||
0x0e79 : { 0x1419: [0x0216], 0x1420 : [0x0216]},
|
||||
|
||||
}
|
||||
EBOOK_DIR_MAIN = ['eBooks/import', 'wordplayer/calibretransfer', 'Books']
|
||||
EXTRA_CUSTOMIZATION_MESSAGE = _('Comma separated list of directories to '
|
||||
@ -60,18 +64,20 @@ class ANDROID(USBMS):
|
||||
EXTRA_CUSTOMIZATION_DEFAULT = ', '.join(EBOOK_DIR_MAIN)
|
||||
|
||||
VENDOR_NAME = ['HTC', 'MOTOROLA', 'GOOGLE_', 'ANDROID', 'ACER',
|
||||
'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE']
|
||||
'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE', 'ARCHOS',
|
||||
'TELECHIP']
|
||||
WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE',
|
||||
'__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897',
|
||||
'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID',
|
||||
'SCH-I500_CARD', 'SPH-D700_CARD', 'MB810', 'GT-P1000', 'DESIRE',
|
||||
'SGH-T849', '_MB300']
|
||||
'SGH-T849', '_MB300', 'A70S', 'S_ANDROID', 'A101IT']
|
||||
WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
|
||||
'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD']
|
||||
'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
|
||||
'A70S', 'A101IT']
|
||||
|
||||
OSX_MAIN_MEM = 'HTC Android Phone Media'
|
||||
OSX_MAIN_MEM = 'Android Device Main Memory'
|
||||
|
||||
MAIN_MEMORY_VOLUME_LABEL = 'Android Phone Internal Memory'
|
||||
MAIN_MEMORY_VOLUME_LABEL = 'Android Device Main Memory'
|
||||
|
||||
SUPPORTS_SUB_DIRS = True
|
||||
|
||||
|
@ -178,7 +178,7 @@ class INVESBOOK(EB600):
|
||||
|
||||
class BOOQ(EB600):
|
||||
name = 'Booq Device Interface'
|
||||
gui_name = 'Booq'
|
||||
gui_name = 'bq Reader'
|
||||
|
||||
FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'pdf', 'doc', 'rtf', 'txt', 'html']
|
||||
|
||||
|
@ -27,7 +27,7 @@ class Book(Book_):
|
||||
|
||||
self.size = size # will be set later if None
|
||||
|
||||
if ContentType == '6':
|
||||
if ContentType == '6' and date is not None:
|
||||
self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
|
||||
else:
|
||||
try:
|
||||
|
@ -33,8 +33,8 @@ class PALMPRE(USBMS):
|
||||
|
||||
class AVANT(USBMS):
|
||||
name = 'Booq Avant Device Interface'
|
||||
gui_name = 'Avant'
|
||||
description = _('Communicate with the Booq Avant')
|
||||
gui_name = 'bq Avant'
|
||||
description = _('Communicate with the Bq Avant')
|
||||
author = 'Kovid Goyal'
|
||||
supported_platforms = ['windows', 'osx', 'linux']
|
||||
|
||||
@ -106,7 +106,7 @@ class PDNOVEL(USBMS):
|
||||
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = '__UMS_COMPOSITE'
|
||||
THUMBNAIL_HEIGHT = 130
|
||||
|
||||
EBOOK_DIR_MAIN = 'eBooks'
|
||||
EBOOK_DIR_MAIN = EBOOK_DIR_CARD_A = 'eBooks'
|
||||
SUPPORTS_SUB_DIRS = False
|
||||
DELETE_EXTS = ['.jpg', '.jpeg', '.png']
|
||||
|
||||
@ -193,6 +193,9 @@ class LUMIREAD(USBMS):
|
||||
|
||||
THUMBNAIL_HEIGHT = 200
|
||||
|
||||
VENDOR_NAME = 'ACER'
|
||||
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'LUMIREAD_600'
|
||||
|
||||
def upload_cover(self, path, filename, metadata, filepath):
|
||||
if metadata.thumbnail and metadata.thumbnail[-1]:
|
||||
cfilepath = filepath.replace('/', os.sep)
|
||||
|
@ -91,3 +91,19 @@ class NOOK_COLOR(NOOK):
|
||||
|
||||
EBOOK_DIR_MAIN = 'My Files/Books'
|
||||
|
||||
'''
|
||||
def create_upload_path(self, path, mdata, fname, create_dirs=True):
|
||||
filepath = NOOK.create_upload_path(self, path, mdata, fname,
|
||||
create_dirs=create_dirs)
|
||||
edm = self.EBOOK_DIR_MAIN.replace('/', os.sep)
|
||||
npath = os.path.join(edm, _('News')) + os.sep
|
||||
if npath in filepath:
|
||||
filepath = filepath.replace(npath, os.sep.join('My Files',
|
||||
'Magazines')+os.sep)
|
||||
filedir = os.path.dirname(filepath)
|
||||
if create_dirs and not os.path.exists(filedir):
|
||||
os.makedirs(filedir)
|
||||
|
||||
return filepath
|
||||
'''
|
||||
|
||||
|
@ -76,17 +76,31 @@ class PRS505(USBMS):
|
||||
'sending DRMed books in which you cannot change the cover.'
|
||||
' WARNING: This option should only be used with newer '
|
||||
'SONY readers: 350, 650, 950 and newer.'),
|
||||
_('Refresh separate covers when using automatic management (newer readers)') +
|
||||
':::' +
|
||||
_('Set this option to have separate book covers uploaded '
|
||||
'every time you connect your device. Unset this option if '
|
||||
'you have so many books on the reader that performance is '
|
||||
'unacceptable.')
|
||||
]
|
||||
EXTRA_CUSTOMIZATION_DEFAULT = [
|
||||
', '.join(['series', 'tags']),
|
||||
False,
|
||||
False
|
||||
]
|
||||
|
||||
OPT_COLLECTIONS = 0
|
||||
OPT_UPLOAD_COVERS = 1
|
||||
OPT_REFRESH_COVERS = 2
|
||||
|
||||
plugboard = None
|
||||
plugboard_func = None
|
||||
|
||||
THUMBNAIL_HEIGHT = 200
|
||||
|
||||
MAX_PATH_LEN = 201 # 250 - (max(len(CACHE_THUMBNAIL), len(MEDIA_THUMBNAIL)) +
|
||||
# len('main_thumbnail.jpg') + 1)
|
||||
|
||||
def windows_filter_pnp_id(self, pnp_id):
|
||||
return '_LAUNCHER' in pnp_id
|
||||
|
||||
@ -171,7 +185,7 @@ class PRS505(USBMS):
|
||||
opts = self.settings()
|
||||
if opts.extra_customization:
|
||||
collections = [x.strip() for x in
|
||||
opts.extra_customization[0].split(',')]
|
||||
opts.extra_customization[self.OPT_COLLECTIONS].split(',')]
|
||||
else:
|
||||
collections = []
|
||||
debug_print('PRS505: collection fields:', collections)
|
||||
@ -183,6 +197,23 @@ class PRS505(USBMS):
|
||||
c.update(blists, collections, pb)
|
||||
c.write()
|
||||
|
||||
if opts.extra_customization[self.OPT_REFRESH_COVERS]:
|
||||
debug_print('PRS505: uploading covers in sync_booklists')
|
||||
for idx,bl in blists.items():
|
||||
prefix = self._card_a_prefix if idx == 1 else \
|
||||
self._card_b_prefix if idx == 2 \
|
||||
else self._main_prefix
|
||||
for book in bl:
|
||||
try:
|
||||
p = os.path.join(prefix, book.lpath)
|
||||
self._upload_cover(os.path.dirname(p),
|
||||
os.path.splitext(os.path.basename(p))[0],
|
||||
book, p)
|
||||
except:
|
||||
debug_print('FAILED to upload cover', p)
|
||||
else:
|
||||
debug_print('PRS505: NOT uploading covers in sync_booklists')
|
||||
|
||||
USBMS.sync_booklists(self, booklists, end_session=end_session)
|
||||
debug_print('PRS505: finished sync_booklists')
|
||||
|
||||
@ -199,11 +230,17 @@ class PRS505(USBMS):
|
||||
|
||||
def upload_cover(self, path, filename, metadata, filepath):
|
||||
opts = self.settings()
|
||||
if not opts.extra_customization[1]:
|
||||
if not opts.extra_customization[self.OPT_UPLOAD_COVERS]:
|
||||
# Building thumbnails disabled
|
||||
debug_print('PRS505: not uploading covers')
|
||||
debug_print('PRS505: not uploading cover')
|
||||
return
|
||||
debug_print('PRS505: uploading covers')
|
||||
debug_print('PRS505: uploading cover')
|
||||
try:
|
||||
self._upload_cover(path, filename, metadata, filepath)
|
||||
except:
|
||||
debug_print('FAILED to upload cover', filepath)
|
||||
|
||||
def _upload_cover(self, path, filename, metadata, filepath):
|
||||
if metadata.thumbnail and metadata.thumbnail[-1]:
|
||||
path = path.replace('/', os.sep)
|
||||
is_main = path.startswith(self._main_prefix)
|
||||
|
@ -98,6 +98,9 @@ class Device(DeviceConfig, DevicePlugin):
|
||||
# copy these back to the library
|
||||
BACKLOADING_ERROR_MESSAGE = None
|
||||
|
||||
#: The maximum length of paths created on the device
|
||||
MAX_PATH_LEN = 250
|
||||
|
||||
def reset(self, key='-1', log_packets=False, report_progress=None,
|
||||
detected_device=None):
|
||||
self._main_prefix = self._card_a_prefix = self._card_b_prefix = None
|
||||
@ -875,7 +878,7 @@ class Device(DeviceConfig, DevicePlugin):
|
||||
|
||||
def create_upload_path(self, path, mdata, fname, create_dirs=True):
|
||||
path = os.path.abspath(path)
|
||||
extra_components = []
|
||||
maxlen = self.MAX_PATH_LEN
|
||||
|
||||
special_tag = None
|
||||
if mdata.tags:
|
||||
@ -902,7 +905,7 @@ class Device(DeviceConfig, DevicePlugin):
|
||||
app_id = str(getattr(mdata, 'application_id', ''))
|
||||
# The db id will be in the created filename
|
||||
extra_components = get_components(template, mdata, fname,
|
||||
timefmt=opts.send_timefmt, length=250-len(app_id)-1)
|
||||
timefmt=opts.send_timefmt, length=maxlen-len(app_id)-1)
|
||||
if not extra_components:
|
||||
extra_components.append(sanitize(self.filename_callback(fname,
|
||||
mdata)))
|
||||
@ -937,12 +940,11 @@ class Device(DeviceConfig, DevicePlugin):
|
||||
return ans
|
||||
|
||||
extra_components = list(map(remove_trailing_periods, extra_components))
|
||||
components = shorten_components_to(250 - len(path), extra_components)
|
||||
components = shorten_components_to(maxlen - len(path), extra_components)
|
||||
components = self.sanitize_path_components(components)
|
||||
filepath = os.path.join(path, *components)
|
||||
filedir = os.path.dirname(filepath)
|
||||
|
||||
|
||||
if create_dirs and not os.path.exists(filedir):
|
||||
os.makedirs(filedir)
|
||||
|
||||
|
@ -18,7 +18,7 @@
|
||||
|
||||
__version__ = "1.0"
|
||||
|
||||
import re
|
||||
import re, codecs
|
||||
|
||||
def detect(aBuf):
|
||||
import calibre.ebooks.chardet.universaldetector as universaldetector
|
||||
@ -83,9 +83,11 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
|
||||
if not raw:
|
||||
return u'', encoding
|
||||
if not isinstance(raw, unicode):
|
||||
if raw.startswith('\xff\xfe'):
|
||||
if raw.startswith(codecs.BOM_UTF8):
|
||||
raw, encoding = raw.decode('utf-8')[1:], 'utf-8'
|
||||
elif raw.startswith(codecs.BOM_UTF16_LE):
|
||||
raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
|
||||
elif raw.startswith('\xfe\xff'):
|
||||
elif raw.startswith(codecs.BOM_UTF16_BE):
|
||||
raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be'
|
||||
if not isinstance(raw, unicode):
|
||||
for pat in ENCODING_PATS:
|
||||
|
@ -75,7 +75,7 @@ class CHMInput(InputFormatPlugin):
|
||||
def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
|
||||
from calibre.ebooks.conversion.plumber import create_oebbook
|
||||
from calibre.ebooks.oeb.base import DirContainer
|
||||
oeb = create_oebbook(log, None, opts, self,
|
||||
oeb = create_oebbook(log, None, opts,
|
||||
encoding=opts.input_encoding, populate=False)
|
||||
self.oeb = oeb
|
||||
|
||||
|
@ -42,6 +42,12 @@ option.
|
||||
For full documentation of the conversion system see
|
||||
''') + 'http://calibre-ebook.com/user_manual/conversion.html'
|
||||
|
||||
HEURISTIC_OPTIONS = ['markup_chapter_headings',
|
||||
'italicize_common_cases', 'fix_indents',
|
||||
'html_unwrap_factor', 'unwrap_lines',
|
||||
'delete_blank_paragraphs', 'format_scene_breaks',
|
||||
'dehyphenate', 'renumber_headings']
|
||||
|
||||
def print_help(parser, log):
|
||||
help = parser.format_help().encode(preferred_encoding, 'replace')
|
||||
log(help)
|
||||
@ -83,6 +89,8 @@ def option_recommendation_to_cli_option(add_option, rec):
|
||||
if opt.long_switch == 'verbose':
|
||||
attrs['action'] = 'count'
|
||||
attrs.pop('type', '')
|
||||
if opt.name in HEURISTIC_OPTIONS and rec.recommended_value is True:
|
||||
switches = ['--disable-'+opt.long_switch]
|
||||
add_option(Option(*switches, **attrs))
|
||||
|
||||
def add_input_output_options(parser, plumber):
|
||||
@ -126,8 +134,24 @@ def add_pipeline_options(parser, plumber):
|
||||
'margin_top', 'margin_left', 'margin_right',
|
||||
'margin_bottom', 'change_justification',
|
||||
'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size',
|
||||
'asciiize', 'remove_header', 'header_regex',
|
||||
'remove_footer', 'footer_regex',
|
||||
'asciiize',
|
||||
]
|
||||
),
|
||||
|
||||
'HEURISTIC PROCESSING' : (
|
||||
_('Modify the document text and structure using common'
|
||||
' patterns. Disabled by default. Use %s to enable. '
|
||||
' Individual actions can be disabled with the %s options.')
|
||||
% ('--enable-heuristics', '--disable-*'),
|
||||
['enable_heuristics'] + HEURISTIC_OPTIONS
|
||||
),
|
||||
|
||||
'SEARCH AND REPLACE' : (
|
||||
_('Modify the document text and structure using user defined patterns.'),
|
||||
[
|
||||
'sr1_search', 'sr1_replace',
|
||||
'sr2_search', 'sr2_replace',
|
||||
'sr3_search', 'sr3_replace',
|
||||
]
|
||||
),
|
||||
|
||||
@ -137,7 +161,6 @@ def add_pipeline_options(parser, plumber):
|
||||
'chapter', 'chapter_mark',
|
||||
'prefer_metadata_cover', 'remove_first_image',
|
||||
'insert_metadata', 'page_breaks_before',
|
||||
'preprocess_html', 'html_unwrap_factor',
|
||||
]
|
||||
),
|
||||
|
||||
@ -164,7 +187,8 @@ def add_pipeline_options(parser, plumber):
|
||||
|
||||
}
|
||||
|
||||
group_order = ['', 'LOOK AND FEEL', 'STRUCTURE DETECTION',
|
||||
group_order = ['', 'LOOK AND FEEL', 'HEURISTIC PROCESSING',
|
||||
'SEARCH AND REPLACE', 'STRUCTURE DETECTION',
|
||||
'TABLE OF CONTENTS', 'METADATA', 'DEBUG']
|
||||
|
||||
for group in group_order:
|
||||
|
@ -72,7 +72,8 @@ class Plumber(object):
|
||||
]
|
||||
|
||||
def __init__(self, input, output, log, report_progress=DummyReporter(),
|
||||
dummy=False, merge_plugin_recs=True, abort_after_input_dump=False):
|
||||
dummy=False, merge_plugin_recs=True, abort_after_input_dump=False,
|
||||
override_input_metadata=False):
|
||||
'''
|
||||
:param input: Path to input file.
|
||||
:param output: Path to output file/directory
|
||||
@ -87,7 +88,9 @@ class Plumber(object):
|
||||
self.log = log
|
||||
self.ui_reporter = report_progress
|
||||
self.abort_after_input_dump = abort_after_input_dump
|
||||
self.override_input_metadata = override_input_metadata
|
||||
|
||||
# Pipeline options {{{
|
||||
# Initialize the conversion options that are independent of input and
|
||||
# output formats. The input and output plugins can still disable these
|
||||
# options via recommendations.
|
||||
@ -375,23 +378,6 @@ OptionRecommendation(name='insert_metadata',
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='preprocess_html',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Attempt to detect and correct hard line breaks and other '
|
||||
'problems in the source file. This may make things worse, so use '
|
||||
'with care.'
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='html_unwrap_factor',
|
||||
recommended_value=0.40, level=OptionRecommendation.LOW,
|
||||
help=_('Scale used to determine the length at which a line should '
|
||||
'be unwrapped if preprocess is enabled. Valid values are a decimal between 0 and 1. The '
|
||||
'default is 0.40, just below the median line length. This will unwrap typical books '
|
||||
' with hard line breaks, but should be reduced if the line length is variable.'
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='smarten_punctuation',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Convert plain quotes, dashes and ellipsis to their '
|
||||
@ -400,32 +386,6 @@ OptionRecommendation(name='smarten_punctuation',
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='remove_header',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Use a regular expression to try and remove the header.'
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='header_regex',
|
||||
recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('The regular expression to use to remove the header.'
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='remove_footer',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Use a regular expression to try and remove the footer.'
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='footer_regex',
|
||||
recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('The regular expression to use to remove the footer.'
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='read_metadata_from_opf',
|
||||
recommended_value=None, level=OptionRecommendation.LOW,
|
||||
short_switch='m',
|
||||
@ -526,7 +486,91 @@ OptionRecommendation(name='timestamp',
|
||||
recommended_value=None, level=OptionRecommendation.LOW,
|
||||
help=_('Set the book timestamp (used by the date column in calibre).')),
|
||||
|
||||
OptionRecommendation(name='enable_heuristics',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Enable heuristic processing. This option must be set for any '
|
||||
'heuristic processing to take place.')),
|
||||
|
||||
OptionRecommendation(name='markup_chapter_headings',
|
||||
recommended_value=True, level=OptionRecommendation.LOW,
|
||||
help=_('Detect unformatted chapter headings and sub headings. Change '
|
||||
'them to h2 and h3 tags. This setting will not create a TOC, '
|
||||
'but can be used in conjunction with structure detection to create '
|
||||
'one.')),
|
||||
|
||||
OptionRecommendation(name='italicize_common_cases',
|
||||
recommended_value=True, level=OptionRecommendation.LOW,
|
||||
help=_('Look for common words and patterns that denote '
|
||||
'italics and italicize them.')),
|
||||
|
||||
OptionRecommendation(name='fix_indents',
|
||||
recommended_value=True, level=OptionRecommendation.LOW,
|
||||
help=_('Turn indentation created from multiple non-breaking space entities '
|
||||
'into CSS indents.')),
|
||||
|
||||
OptionRecommendation(name='html_unwrap_factor',
|
||||
recommended_value=0.40, level=OptionRecommendation.LOW,
|
||||
help=_('Scale used to determine the length at which a line should '
|
||||
'be unwrapped. Valid values are a decimal between 0 and 1. The '
|
||||
'default is 0.4, just below the median line length. If only a '
|
||||
'few lines in the document require unwrapping this value should '
|
||||
'be reduced')),
|
||||
|
||||
OptionRecommendation(name='unwrap_lines',
|
||||
recommended_value=True, level=OptionRecommendation.LOW,
|
||||
help=_('Unwrap lines using punctuation and other formatting clues.')),
|
||||
|
||||
OptionRecommendation(name='delete_blank_paragraphs',
|
||||
recommended_value=True, level=OptionRecommendation.LOW,
|
||||
help=_('Remove empty paragraphs from the document when they exist between '
|
||||
'every other paragraph')),
|
||||
|
||||
OptionRecommendation(name='format_scene_breaks',
|
||||
recommended_value=True, level=OptionRecommendation.LOW,
|
||||
help=_('Left aligned scene break markers are center aligned. '
|
||||
'Replace soft scene breaks that use multiple blank lines with'
|
||||
'horizontal rules.')),
|
||||
|
||||
OptionRecommendation(name='dehyphenate',
|
||||
recommended_value=True, level=OptionRecommendation.LOW,
|
||||
help=_('Analyze hyphenated words throughout the document. The '
|
||||
'document itself is used as a dictionary to determine whether hyphens '
|
||||
'should be retained or removed.')),
|
||||
|
||||
OptionRecommendation(name='renumber_headings',
|
||||
recommended_value=True, level=OptionRecommendation.LOW,
|
||||
help=_('Looks for occurrences of sequential <h1> or <h2> tags. '
|
||||
'The tags are renumbered to prevent splitting in the middle '
|
||||
'of chapter headings.')),
|
||||
|
||||
OptionRecommendation(name='sr1_search',
|
||||
recommended_value='', level=OptionRecommendation.LOW,
|
||||
help=_('Search pattern (regular expression) to be replaced with '
|
||||
'sr1-replace.')),
|
||||
|
||||
OptionRecommendation(name='sr1_replace',
|
||||
recommended_value='', level=OptionRecommendation.LOW,
|
||||
help=_('Replacement to replace the text found with sr1-search.')),
|
||||
|
||||
OptionRecommendation(name='sr2_search',
|
||||
recommended_value='', level=OptionRecommendation.LOW,
|
||||
help=_('Search pattern (regular expression) to be replaced with '
|
||||
'sr2-replace.')),
|
||||
|
||||
OptionRecommendation(name='sr2_replace',
|
||||
recommended_value='', level=OptionRecommendation.LOW,
|
||||
help=_('Replacement to replace the text found with sr2-search.')),
|
||||
|
||||
OptionRecommendation(name='sr3_search',
|
||||
recommended_value='', level=OptionRecommendation.LOW,
|
||||
help=_('Search pattern (regular expression) to be replaced with '
|
||||
'sr3-replace.')),
|
||||
|
||||
OptionRecommendation(name='sr3_replace',
|
||||
recommended_value='', level=OptionRecommendation.LOW,
|
||||
help=_('Replacement to replace the text found with sr3-search.')),
|
||||
]
|
||||
# }}}
|
||||
|
||||
input_fmt = os.path.splitext(self.input)[1]
|
||||
if not input_fmt:
|
||||
@ -859,7 +903,6 @@ OptionRecommendation(name='timestamp',
|
||||
self.opts_to_mi(self.user_metadata)
|
||||
if not hasattr(self.oeb, 'manifest'):
|
||||
self.oeb = create_oebbook(self.log, self.oeb, self.opts,
|
||||
self.input_plugin,
|
||||
encoding=self.input_plugin.output_encoding)
|
||||
self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
|
||||
self.opts.is_image_collection = self.input_plugin.is_image_collection
|
||||
@ -883,7 +926,8 @@ OptionRecommendation(name='timestamp',
|
||||
self.opts.dest = self.opts.output_profile
|
||||
|
||||
from calibre.ebooks.oeb.transforms.metadata import MergeMetadata
|
||||
MergeMetadata()(self.oeb, self.user_metadata, self.opts)
|
||||
MergeMetadata()(self.oeb, self.user_metadata, self.opts,
|
||||
override_input_metadata=self.override_input_metadata)
|
||||
pr(0.2)
|
||||
self.flush()
|
||||
|
||||
@ -969,14 +1013,15 @@ OptionRecommendation(name='timestamp',
|
||||
self.log(self.output_fmt.upper(), 'output written to', self.output)
|
||||
self.flush()
|
||||
|
||||
def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
|
||||
def create_oebbook(log, path_or_stream, opts, reader=None,
|
||||
encoding='utf-8', populate=True):
|
||||
'''
|
||||
Create an OEBBook.
|
||||
'''
|
||||
from calibre.ebooks.oeb.base import OEBBook
|
||||
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
|
||||
opts.preprocess_html, opts)
|
||||
html_preprocessor = HTMLPreProcessor(log, opts)
|
||||
if not encoding:
|
||||
encoding = None
|
||||
oeb = OEBBook(log, html_preprocessor,
|
||||
pretty_print=opts.pretty_print, input_encoding=encoding)
|
||||
if not populate:
|
||||
|
@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
|
||||
|
||||
import functools, re
|
||||
|
||||
from calibre import entity_to_unicode
|
||||
from calibre import entity_to_unicode, as_unicode
|
||||
|
||||
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
|
||||
SVG_NS = 'http://www.w3.org/2000/svg'
|
||||
@ -78,6 +78,8 @@ class DocAnalysis(object):
|
||||
linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
|
||||
elif format == 'spanned_html':
|
||||
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
|
||||
elif format == 'txt':
|
||||
linere = re.compile('.*?\n')
|
||||
self.lines = linere.findall(raw)
|
||||
|
||||
def line_length(self, percent):
|
||||
@ -172,13 +174,19 @@ class Dehyphenator(object):
|
||||
retain hyphens.
|
||||
'''
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, verbose=0, log=None):
|
||||
self.log = log
|
||||
self.verbose = verbose
|
||||
# Add common suffixes to the regex below to increase the likelihood of a match -
|
||||
# don't add suffixes which are also complete words, such as 'able' or 'sex'
|
||||
self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
|
||||
# only remove if it's not already the point of hyphenation
|
||||
self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$"
|
||||
self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
|
||||
self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
|
||||
# remove prefixes if the prefix was not already the point of hyphenation
|
||||
self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
|
||||
self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
|
||||
self.prefix_string = '^(dis|re|un|in|ex)'
|
||||
self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
|
||||
self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)
|
||||
|
||||
def dehyphenate(self, match):
|
||||
firsthalf = match.group('firstpart')
|
||||
@ -189,31 +197,48 @@ class Dehyphenator(object):
|
||||
wraptags = ''
|
||||
hyphenated = unicode(firsthalf) + "-" + unicode(secondhalf)
|
||||
dehyphenated = unicode(firsthalf) + unicode(secondhalf)
|
||||
if self.suffixes.match(secondhalf) is None:
|
||||
lookupword = self.removesuffixes.sub('', dehyphenated)
|
||||
if self.prefixes.match(firsthalf) is None:
|
||||
else:
|
||||
lookupword = dehyphenated
|
||||
if len(firsthalf) > 4 and self.prefixes.match(firsthalf) is None:
|
||||
lookupword = self.removeprefix.sub('', lookupword)
|
||||
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
|
||||
if self.verbose > 2:
|
||||
self.log("lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated))
|
||||
try:
|
||||
searchresult = self.html.find(lookupword.lower())
|
||||
except:
|
||||
return hyphenated
|
||||
if self.format == 'html_cleanup':
|
||||
if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
|
||||
if self.html.find(lookupword) != -1 or searchresult != -1:
|
||||
#print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
|
||||
if self.verbose > 2:
|
||||
self.log(" Cleanup:returned dehyphenated word: " + str(dehyphenated))
|
||||
return dehyphenated
|
||||
elif self.html.find(hyphenated) != -1:
|
||||
#print "Cleanup:returned hyphenated word: " + str(hyphenated)
|
||||
if self.verbose > 2:
|
||||
self.log(" Cleanup:returned hyphenated word: " + str(hyphenated))
|
||||
return hyphenated
|
||||
else:
|
||||
#print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
|
||||
if self.verbose > 2:
|
||||
self.log(" Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf))
|
||||
return firsthalf+u'\u2014'+wraptags+secondhalf
|
||||
|
||||
else:
|
||||
if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
|
||||
if self.verbose > 2:
|
||||
self.log("too short, returned hyphenated word: " + str(hyphenated))
|
||||
return hyphenated
|
||||
if len(firsthalf) <= 2 and len(secondhalf) <= 2:
|
||||
if self.verbose > 2:
|
||||
self.log("too short, returned hyphenated word: " + str(hyphenated))
|
||||
return hyphenated
|
||||
if self.html.find(lookupword) != -1 or searchresult != -1:
|
||||
#print "returned dehyphenated word: " + str(dehyphenated)
|
||||
if self.verbose > 2:
|
||||
self.log(" returned dehyphenated word: " + str(dehyphenated))
|
||||
return dehyphenated
|
||||
else:
|
||||
#print " returned hyphenated word: " + str(hyphenated)
|
||||
if self.verbose > 2:
|
||||
self.log(" returned hyphenated word: " + str(hyphenated))
|
||||
return hyphenated
|
||||
|
||||
def __call__(self, html, format, length=1):
|
||||
@ -223,10 +248,15 @@ class Dehyphenator(object):
|
||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)
|
||||
elif format == 'pdf':
|
||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
|
||||
elif format == 'txt':
|
||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
|
||||
elif format == 'individual_words':
|
||||
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
|
||||
intextmatch = re.compile(u'(?!<)(?P<firstpart>\w+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)')
|
||||
elif format == 'html_cleanup':
|
||||
intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
|
||||
elif format == 'txt_cleanup':
|
||||
intextmatch = re.compile(u'(?P<firstpart>\w+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
|
||||
|
||||
|
||||
html = intextmatch.sub(self.dehyphenate, html)
|
||||
return html
|
||||
@ -353,7 +383,7 @@ class HTMLPreProcessor(object):
|
||||
(re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
|
||||
|
||||
# Center separator lines
|
||||
(re.compile(u'<br>\s*(?P<break>([*#•✦]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
|
||||
(re.compile(u'<br>\s*(?P<break>([*#•✦=]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
|
||||
|
||||
# Remove page links
|
||||
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
|
||||
@ -390,10 +420,8 @@ class HTMLPreProcessor(object):
|
||||
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
||||
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
|
||||
]
|
||||
def __init__(self, input_plugin_preprocess, plugin_preprocess,
|
||||
extra_opts=None):
|
||||
self.input_plugin_preprocess = input_plugin_preprocess
|
||||
self.plugin_preprocess = plugin_preprocess
|
||||
def __init__(self, log=None, extra_opts=None):
|
||||
self.log = log
|
||||
self.extra_opts = extra_opts
|
||||
|
||||
def is_baen(self, src):
|
||||
@ -429,27 +457,20 @@ class HTMLPreProcessor(object):
|
||||
if not getattr(self.extra_opts, 'keep_ligatures', False):
|
||||
html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
|
||||
|
||||
for search, replace in [['sr3_search', 'sr3_replace'], ['sr2_search', 'sr2_replace'], ['sr1_search', 'sr1_replace']]:
|
||||
search_pattern = getattr(self.extra_opts, search, '')
|
||||
if search_pattern:
|
||||
try:
|
||||
search_re = re.compile(search_pattern)
|
||||
replace_txt = getattr(self.extra_opts, replace, '')
|
||||
if not replace_txt:
|
||||
replace_txt = ''
|
||||
rules.insert(0, (search_re, replace_txt))
|
||||
except Exception as e:
|
||||
self.log.error('Failed to parse %r regexp because %s' %
|
||||
(search, as_unicode(e)))
|
||||
|
||||
end_rules = []
|
||||
if getattr(self.extra_opts, 'remove_header', None):
|
||||
try:
|
||||
rules.insert(0,
|
||||
(re.compile(self.extra_opts.header_regex), lambda match : '')
|
||||
)
|
||||
except:
|
||||
import traceback
|
||||
print 'Failed to parse remove_header regexp'
|
||||
traceback.print_exc()
|
||||
|
||||
if getattr(self.extra_opts, 'remove_footer', None):
|
||||
try:
|
||||
rules.insert(0,
|
||||
(re.compile(self.extra_opts.footer_regex), lambda match : '')
|
||||
)
|
||||
except:
|
||||
import traceback
|
||||
print 'Failed to parse remove_footer regexp'
|
||||
traceback.print_exc()
|
||||
|
||||
# delete soft hyphens - moved here so it's executed after header/footer removal
|
||||
if is_pdftohtml:
|
||||
# unwrap/delete soft hyphens
|
||||
@ -457,12 +478,6 @@ class HTMLPreProcessor(object):
|
||||
# unwrap/delete soft hyphens with formatting
|
||||
end_rules.append((re.compile(u'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
||||
|
||||
# Make the more aggressive chapter marking regex optional with the preprocess option to
|
||||
# reduce false positives and move after header/footer removal
|
||||
if getattr(self.extra_opts, 'preprocess_html', None):
|
||||
if is_pdftohtml:
|
||||
end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),)
|
||||
|
||||
length = -1
|
||||
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
|
||||
docanalysis = DocAnalysis('pdf', html)
|
||||
@ -473,7 +488,7 @@ class HTMLPreProcessor(object):
|
||||
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
|
||||
end_rules.append(
|
||||
# Un wrap using punctuation
|
||||
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
||||
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
||||
)
|
||||
|
||||
for rule in self.PREPROCESS + start_rules:
|
||||
@ -505,15 +520,14 @@ class HTMLPreProcessor(object):
|
||||
|
||||
if is_pdftohtml and length > -1:
|
||||
# Dehyphenate
|
||||
dehyphenator = Dehyphenator()
|
||||
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
|
||||
html = dehyphenator(html,'html', length)
|
||||
|
||||
if is_pdftohtml:
|
||||
from calibre.ebooks.conversion.utils import PreProcessor
|
||||
pdf_markup = PreProcessor(self.extra_opts, None)
|
||||
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||
pdf_markup = HeuristicProcessor(self.extra_opts, None)
|
||||
totalwords = 0
|
||||
totalwords = pdf_markup.get_word_count(html)
|
||||
if totalwords > 7000:
|
||||
if pdf_markup.get_word_count(html) > 7000:
|
||||
html = pdf_markup.markup_chapters(html, totalwords, True)
|
||||
|
||||
#dump(html, 'post-preprocess')
|
||||
@ -533,8 +547,10 @@ class HTMLPreProcessor(object):
|
||||
unidecoder = Unidecoder()
|
||||
html = unidecoder.decode(html)
|
||||
|
||||
if self.plugin_preprocess:
|
||||
html = self.input_plugin_preprocess(self.extra_opts, html)
|
||||
if getattr(self.extra_opts, 'enable_heuristics', False):
|
||||
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||
preprocessor = HeuristicProcessor(self.extra_opts, self.log)
|
||||
html = preprocessor(html)
|
||||
|
||||
if getattr(self.extra_opts, 'smarten_punctuation', False):
|
||||
html = self.smarten_punctuation(html)
|
||||
@ -561,8 +577,8 @@ class HTMLPreProcessor(object):
|
||||
html = html.replace(start, '<!--')
|
||||
html = html.replace(stop, '-->')
|
||||
# convert ellipsis to entities to prevent wrapping
|
||||
html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html)
|
||||
html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html)
|
||||
# convert double dashes to em-dash
|
||||
html = re.sub('\s--\s', u'\u2014', html)
|
||||
html = re.sub(r'\s--\s', u'\u2014', html)
|
||||
return substitute_entites(html)
|
||||
|
||||
|
@@ -11,13 +11,22 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log
from calibre.utils.wordcount import get_wordcount_obj

class PreProcessor(object):
class HeuristicProcessor(object):

def __init__(self, extra_opts=None, log=None):
self.log = default_log if log is None else log
self.html_preprocess_sections = 0
self.found_indents = 0
self.extra_opts = extra_opts
self.deleted_nbsps = False
self.totalwords = 0
self.min_chapters = 1
self.chapters_no_title = 0
self.chapters_with_title = 0
self.blanks_deleted = False
self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)

def is_pdftohtml(self, src):
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
@@ -27,12 +36,12 @@ class PreProcessor(object):
title = match.group('title')
if not title:
self.html_preprocess_sections = self.html_preprocess_sections + 1
self.log("marked " + unicode(self.html_preprocess_sections) +
self.log.debug("marked " + unicode(self.html_preprocess_sections) +
" chapters. - " + unicode(chap))
return '<h2>'+chap+'</h2>\n'
else:
self.html_preprocess_sections = self.html_preprocess_sections + 1
self.log("marked " + unicode(self.html_preprocess_sections) +
self.log.debug("marked " + unicode(self.html_preprocess_sections) +
" chapters & titles. - " + unicode(chap) + ", " + unicode(title))
return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'

@@ -40,10 +49,18 @@ class PreProcessor(object):
chap = match.group('section')
styles = match.group('styles')
self.html_preprocess_sections = self.html_preprocess_sections + 1
self.log("marked " + unicode(self.html_preprocess_sections) +
self.log.debug("marked " + unicode(self.html_preprocess_sections) +
" section markers based on punctuation. - " + unicode(chap))
return '<'+styles+' style="page-break-before:always">'+chap

def analyze_title_matches(self, match):
#chap = match.group('chap')
title = match.group('title')
if not title:
self.chapters_no_title = self.chapters_no_title + 1
else:
self.chapters_with_title = self.chapters_with_title + 1

def insert_indent(self, match):
pstyle = match.group('formatting')
span = match.group('span')
@@ -75,8 +92,8 @@ class PreProcessor(object):
line_end = line_end_ere.findall(raw)
tot_htm_ends = len(htm_end)
tot_ln_fds = len(line_end)
self.log("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
unicode(tot_htm_ends) + " marked up endings")
#self.log.debug("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
# unicode(tot_htm_ends) + " marked up endings")

if percent > 1:
percent = 1
@@ -84,9 +101,8 @@ class PreProcessor(object):
percent = 0

min_lns = tot_ln_fds * percent
self.log("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
if min_lns > tot_htm_ends:
return True
#self.log.debug("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
return min_lns > tot_htm_ends

def dump(self, raw, where):
import os
@@ -112,16 +128,55 @@ class PreProcessor(object):
wordcount = get_wordcount_obj(word_count_text)
return wordcount.words

def markup_italicis(self, html):
ITALICIZE_WORDS = [
'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
'Mlle.', 'Mons.', 'PS.', 'PPS.',
]

ITALICIZE_STYLE_PATS = [
r'(?msu)(?<=\s)_(?P<words>\S[^_]{0,40}?\S)?_(?=\s)',
r'(?msu)(?<=\s)/(?P<words>\S[^/]{0,40}?\S)?/(?=\s)',
r'(?msu)(?<=\s)~~(?P<words>\S[^~]{0,40}?\S)?~~(?=\s)',
r'(?msu)(?<=\s)\*(?P<words>\S[^\*]{0,40}?\S)?\*(?=\s)',
r'(?msu)(?<=\s)~(?P<words>\S[^~]{0,40}?\S)?~(?=\s)',
r'(?msu)(?<=\s)_/(?P<words>\S[^/_]{0,40}?\S)?/_(?=\s)',
r'(?msu)(?<=\s)_\*(?P<words>\S[^\*_]{0,40}?\S)?\*_(?=\s)',
r'(?msu)(?<=\s)\*/(?P<words>\S[^/\*]{0,40}?\S)?/\*(?=\s)',
r'(?msu)(?<=\s)_\*/(?P<words>\S[^\*_]{0,40}?\S)?/\*_(?=\s)',
r'(?msu)(?<=\s)/:(?P<words>\S[^:/]{0,40}?\S)?:/(?=\s)',
r'(?msu)(?<=\s)\|:(?P<words>\S[^:\|]{0,40}?\S)?:\|(?=\s)',
]

for word in ITALICIZE_WORDS:
html = html.replace(word, '<i>%s</i>' % word)

for pat in ITALICIZE_STYLE_PATS:
html = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), html)

return html

def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
'''
Searches for common chapter headings throughout the document
attempts multiple patterns based on likelihood of a match
with minimum false positives. Exits after finding a successful pattern
'''
# Typical chapters are between 2000 and 7000 words, use the larger number to decide the
# minimum of chapters to search for
self.min_chapters = 1
# minimum of chapters to search for. A max limit is calculated to prevent things like OCR
# or pdf page numbers from being treated as TOC markers
max_chapters = 150
typical_chapters = 7000.
if wordcount > 7000:
self.min_chapters = int(ceil(wordcount / 7000.))
#print "minimum chapters required are: "+str(self.min_chapters)
if wordcount > 200000:
typical_chapters = 15000.
self.min_chapters = int(ceil(wordcount / typical_chapters))
self.log.debug("minimum chapters required are: "+str(self.min_chapters))
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html))
self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")

# Build the Regular Expressions in pieces
init_lookahead = "(?=<(p|div))"
@@ -151,88 +206,160 @@ class PreProcessor(object):
|
||||
n_lookahead_open = "\s+(?!"
|
||||
n_lookahead_close = ")"
|
||||
|
||||
default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
|
||||
default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
|
||||
simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"
|
||||
|
||||
analysis_result = []
|
||||
|
||||
chapter_types = [
|
||||
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
|
||||
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
|
||||
[r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
|
||||
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
|
||||
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
|
||||
[r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon
|
||||
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
|
||||
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
|
||||
[r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles
|
||||
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
|
||||
[r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters
|
||||
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering
|
||||
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
|
||||
[r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon
|
||||
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
|
||||
]
|
||||
|
||||
def recurse_patterns(html, analyze):
|
||||
# Start with most typical chapter headings, get more aggressive until one works
|
||||
for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
|
||||
for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
|
||||
n_lookahead = ''
|
||||
hits = 0
|
||||
self.chapters_no_title = 0
|
||||
self.chapters_with_title = 0
|
||||
|
||||
if n_lookahead_req:
|
||||
lp_n_lookahead_open = n_lookahead_open
|
||||
lp_n_lookahead_close = n_lookahead_close
|
||||
else:
|
||||
lp_n_lookahead_open = ''
|
||||
lp_n_lookahead_close = ''
|
||||
|
||||
if strict_title:
|
||||
lp_title = default_title
|
||||
else:
|
||||
lp_title = simple_title
|
||||
|
||||
if ignorecase:
|
||||
arg_ignorecase = r'(?i)'
|
||||
else:
|
||||
arg_ignorecase = ''
|
||||
|
||||
if title_req:
|
||||
lp_opt_title_open = ''
|
||||
lp_opt_title_close = ''
|
||||
else:
|
||||
lp_opt_title_open = opt_title_open
|
||||
lp_opt_title_close = opt_title_close
|
||||
|
||||
if self.html_preprocess_sections >= self.min_chapters:
|
||||
break
|
||||
full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
|
||||
if n_lookahead_req:
|
||||
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
|
||||
self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
|
||||
if lookahead_ignorecase:
|
||||
chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
|
||||
chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
|
||||
if not analyze:
|
||||
self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
|
||||
|
||||
chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
|
||||
chapdetect = re.compile(r'%s' % chapter_marker)
|
||||
|
||||
if analyze:
|
||||
hits = len(chapdetect.findall(html))
|
||||
if hits:
|
||||
chapdetect.sub(self.analyze_title_matches, html)
|
||||
if float(self.chapters_with_title) / float(hits) > .5:
|
||||
title_req = True
|
||||
strict_title = False
|
||||
self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
|
||||
if type_name == 'common':
|
||||
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
|
||||
elif self.min_chapters <= hits < max_chapters:
|
||||
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
|
||||
break
|
||||
else:
|
||||
chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
|
||||
chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
|
||||
html = chapdetect.sub(self.chapter_head, html)
|
||||
return html
|
||||
|
||||
recurse_patterns(html, True)
|
||||
chapter_types = analysis_result
|
||||
html = recurse_patterns(html, False)
|
||||
|
||||
words_per_chptr = wordcount
|
||||
if words_per_chptr > 0 and self.html_preprocess_sections > 0:
|
||||
words_per_chptr = wordcount / self.html_preprocess_sections
|
||||
self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
|
||||
self.log.debug("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
|
||||
return html
|
||||
|
||||
def punctuation_unwrap(self, length, content, format):
|
||||
'''
|
||||
Unwraps lines based on line length and punctuation
|
||||
supports a range of html markup and text files
|
||||
'''
|
||||
# define the pieces of the regex
|
||||
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
|
||||
em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
|
||||
soft_hyphen = u"\xad"
|
||||
line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
|
||||
blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
|
||||
line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
|
||||
txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
|
||||
|
||||
unwrap_regex = lookahead+line_ending+blanklines+line_opening
|
||||
em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
|
||||
shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
|
||||
|
||||
def __call__(self, html):
|
||||
self.log("********* Preprocessing HTML *********")
|
||||
if format == 'txt':
|
||||
unwrap_regex = lookahead+txt_line_wrap
|
||||
em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
|
||||
shy_unwrap_regex = soft_hyphen+txt_line_wrap
|
||||
|
||||
# Count the words in the document to estimate how many chapters to look for and whether
|
||||
# other types of processing are attempted
|
||||
totalwords = 0
|
||||
totalwords = self.get_word_count(html)
|
||||
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
|
||||
em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
|
||||
shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
|
||||
|
||||
if totalwords < 20:
|
||||
self.log("not enough text, not preprocessing")
|
||||
return html
|
||||
content = unwrap.sub(' ', content)
|
||||
content = em_en_unwrap.sub('', content)
|
||||
content = shy_unwrap.sub('', content)
|
||||
return content
|
||||
|
||||
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
|
||||
html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
|
||||
html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
|
||||
|
||||
###### Check Markup ######
|
||||
#
|
||||
# some lit files don't have any <p> tags or equivalent (generally just plain text between
|
||||
# <pre> tags), check and mark up line endings if required before proceeding
|
||||
if self.no_markup(html, 0.1):
|
||||
self.log("not enough paragraph markers, adding now")
|
||||
# check if content is in pre tags, use txt processor to mark up if so
|
||||
pre = re.compile(r'<pre>', re.IGNORECASE)
|
||||
if len(pre.findall(html)) == 1:
|
||||
self.log("Running Text Processing")
|
||||
def txt_process(self, match):
|
||||
from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
|
||||
separate_paragraphs_single_line
|
||||
outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
|
||||
html = outerhtml.sub('\g<text>', html)
|
||||
html = separate_paragraphs_single_line(html)
|
||||
html = preserve_spaces(html)
|
||||
html = convert_basic(html, epub_split_size_kb=0)
|
||||
content = match.group('text')
|
||||
content = separate_paragraphs_single_line(content)
|
||||
content = preserve_spaces(content)
|
||||
content = convert_basic(content, epub_split_size_kb=0)
|
||||
return content
|
||||
|
||||
def markup_pre(self, html):
|
||||
pre = re.compile(r'<pre>', re.IGNORECASE)
|
||||
if len(pre.findall(html)) >= 1:
|
||||
self.log.debug("Running Text Processing")
|
||||
outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
|
||||
html = outerhtml.sub(self.txt_process, html)
|
||||
else:
|
||||
# Add markup naively
|
||||
# TODO - find out if there are cases where there are more than one <pre> tag or
|
||||
# other types of unmarked html and handle them in some better fashion
|
||||
add_markup = re.compile('(?<!>)(\n)')
|
||||
html = add_markup.sub('</p>\n<p>', html)
|
||||
return html
|
||||
|
||||
###### Mark Indents/Cleanup ######
|
||||
#
|
||||
# Replace series of non-breaking spaces with text-indent
|
||||
def arrange_htm_line_endings(self, html):
|
||||
html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
|
||||
html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
|
||||
return html
|
||||
|
||||
def fix_nbsp_indents(self, html):
|
||||
txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
|
||||
html = txtindent.sub(self.insert_indent, html)
|
||||
if self.found_indents > 1:
|
||||
self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
|
||||
self.log.debug("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
|
||||
return html
|
||||
|
||||
def cleanup_markup(self, html):
|
||||
# remove remaining non-breaking spaces
|
||||
html = re.sub(ur'\u00a0', ' ', html)
|
||||
# Get rid of various common microsoft specific tags which can cause issues later
|
||||
@ -240,109 +367,166 @@ class PreProcessor(object):
|
||||
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
|
||||
# Delete microsoft 'smart' tags
|
||||
html = re.sub('(?i)</?st1:\w+>', '', html)
|
||||
# Get rid of empty span, bold, & italics tags
|
||||
# Get rid of empty span, bold, font, em, & italics tags
|
||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||
html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
|
||||
html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
|
||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||
# ADE doesn't render <br />, change to empty paragraphs
|
||||
#html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
|
||||
html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
|
||||
self.deleted_nbsps = True
|
||||
return html
|
||||
|
||||
# If more than 40% of the lines are empty paragraphs and the user has enabled remove
|
||||
# paragraph spacing then delete blank lines to clean up spacing
|
||||
linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
|
||||
blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
||||
#multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
|
||||
blanklines = blankreg.findall(html)
|
||||
lines = linereg.findall(html)
|
||||
blanks_between_paragraphs = False
|
||||
if len(lines) > 1:
|
||||
self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
|
||||
unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
|
||||
if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
|
||||
'remove_paragraph_spacing', False):
|
||||
self.log("deleting blank lines")
|
||||
html = blankreg.sub('', html)
|
||||
elif float(len(blanklines)) / float(len(lines)) > 0.40:
|
||||
blanks_between_paragraphs = True
|
||||
#print "blanks between paragraphs is marked True"
|
||||
else:
|
||||
blanks_between_paragraphs = False
|
||||
|
||||
#self.dump(html, 'before_chapter_markup')
|
||||
# detect chapters/sections to match xpath or splitting logic
|
||||
#
|
||||
|
||||
html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
|
||||
|
||||
|
||||
###### Unwrap lines ######
|
||||
#
|
||||
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
|
||||
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
|
||||
# that lines can be un-wrapped across page boundaries
|
||||
def analyze_line_endings(self, html):
|
||||
'''
|
||||
determines the type of html line ending used most commonly in a document
|
||||
use before calling docanalysis functions
|
||||
'''
|
||||
paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
|
||||
spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
|
||||
paras = len(paras_reg.findall(html))
|
||||
spans = len(spans_reg.findall(html))
|
||||
if spans > 1:
|
||||
if float(paras) / float(spans) < 0.75:
|
||||
format = 'spanned_html'
|
||||
return 'spanned_html'
|
||||
else:
|
||||
format = 'html'
|
||||
return 'html'
|
||||
else:
|
||||
format = 'html'
|
||||
return 'html'
|
||||
|
||||
def analyze_blanks(self, html):
|
||||
blanklines = self.blankreg.findall(html)
|
||||
lines = self.linereg.findall(html)
|
||||
if len(lines) > 1:
|
||||
self.log.debug("There are " + unicode(len(blanklines)) + " blank lines. " +
|
||||
unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
|
||||
|
||||
if float(len(blanklines)) / float(len(lines)) > 0.40:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def cleanup_required(self):
|
||||
for option in ['unwrap_lines', 'markup_chapter_headings', 'format_scene_breaks', 'delete_blank_paragraphs']:
|
||||
if getattr(self.extra_opts, option, False):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def __call__(self, html):
|
||||
self.log.debug("********* Heuristic processing HTML *********")
|
||||
|
||||
# Count the words in the document to estimate how many chapters to look for and whether
|
||||
# other types of processing are attempted
|
||||
try:
|
||||
self.totalwords = self.get_word_count(html)
|
||||
except:
|
||||
self.log.warn("Can't get wordcount")
|
||||
|
||||
if self.totalwords < 50:
|
||||
self.log.warn("flow is too short, not running heuristics")
|
||||
return html
|
||||
|
||||
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
|
||||
html = self.arrange_htm_line_endings(html)
|
||||
|
||||
if self.cleanup_required():
|
||||
###### Check Markup ######
|
||||
#
|
||||
# some lit files don't have any <p> tags or equivalent (generally just plain text between
|
||||
# <pre> tags), check and mark up line endings if required before proceeding
|
||||
# fix indents must run after this step
|
||||
if self.no_markup(html, 0.1):
|
||||
self.log.debug("not enough paragraph markers, adding now")
|
||||
# markup using text processing
|
||||
html = self.markup_pre(html)
|
||||
|
||||
# Replace series of non-breaking spaces with text-indent
|
||||
if getattr(self.extra_opts, 'fix_indents', False):
|
||||
html = self.fix_nbsp_indents(html)
|
||||
|
||||
if self.cleanup_required():
|
||||
# fix indents must run before this step, as it removes non-breaking spaces
|
||||
html = self.cleanup_markup(html)
|
||||
|
||||
# ADE doesn't render <br />, change to empty paragraphs
|
||||
#html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
|
||||
|
||||
# Determine whether the document uses interleaved blank lines
|
||||
blanks_between_paragraphs = self.analyze_blanks(html)
|
||||
|
||||
#self.dump(html, 'before_chapter_markup')
|
||||
# detect chapters/sections to match xpath or splitting logic
|
||||
|
||||
if getattr(self.extra_opts, 'markup_chapter_headings', False):
|
||||
html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
|
||||
|
||||
if getattr(self.extra_opts, 'italicize_common_cases', False):
|
||||
html = self.markup_italicis(html)
|
||||
|
||||
# If more than 40% of the lines are empty paragraphs and the user has enabled delete
|
||||
# blank paragraphs then delete blank lines to clean up spacing
|
||||
if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
|
||||
self.log.debug("deleting blank lines")
|
||||
self.blanks_deleted = True
|
||||
html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
|
||||
html = self.blankreg.sub('', html)
|
||||
|
||||
# Determine line ending type
|
||||
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
|
||||
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
|
||||
# that lines can be un-wrapped across page boundaries
|
||||
format = self.analyze_line_endings(html)
|
||||
|
||||
# Check Line histogram to determine if the document uses hard line breaks, If 50% or
|
||||
# more of the lines break in the same region of the document then unwrapping is required
|
||||
docanalysis = DocAnalysis(format, html)
|
||||
hardbreaks = docanalysis.line_histogram(.50)
|
||||
self.log("Hard line breaks check returned "+unicode(hardbreaks))
|
||||
self.log.debug("Hard line breaks check returned "+unicode(hardbreaks))
|
||||
|
||||
# Calculate Length
|
||||
unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
|
||||
length = docanalysis.line_length(unwrap_factor)
|
||||
self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
|
||||
self.log.debug("Median line length is " + unicode(length) + ", calculated with " + format + " format")
|
||||
|
||||
###### Unwrap lines ######
|
||||
if getattr(self.extra_opts, 'unwrap_lines', False):
|
||||
# only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
|
||||
if hardbreaks or unwrap_factor < 0.4:
|
||||
self.log("Unwrapping required, unwrapping Lines")
|
||||
# Unwrap em/en dashes
|
||||
html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
|
||||
# Dehyphenate
|
||||
self.log("Unwrapping/Removing hyphens")
|
||||
dehyphenator = Dehyphenator()
|
||||
self.log.debug("Unwrapping required, unwrapping Lines")
|
||||
# Dehyphenate with line length limiters
|
||||
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
|
||||
html = dehyphenator(html,'html', length)
|
||||
self.log("Done dehyphenating")
|
||||
# Unwrap lines using punctation and line length
|
||||
#unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
|
||||
unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
||||
html = unwrap.sub(' ', html)
|
||||
#check any remaining hyphens, but only unwrap if there is a match
|
||||
dehyphenator = Dehyphenator()
|
||||
html = dehyphenator(html,'html_cleanup', length)
|
||||
else:
|
||||
# dehyphenate in cleanup mode to fix anything previous conversions/editing missed
|
||||
self.log("Cleaning up hyphenation")
|
||||
dehyphenator = Dehyphenator()
|
||||
html = dehyphenator(html,'html_cleanup', length)
|
||||
self.log("Done dehyphenating")
|
||||
html = self.punctuation_unwrap(length, html, 'html')
|
||||
|
||||
# delete soft hyphens
|
||||
html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
|
||||
if getattr(self.extra_opts, 'dehyphenate', False):
|
||||
# dehyphenate in cleanup mode to fix anything previous conversions/editing missed
|
||||
self.log.debug("Fixing hyphenated content")
|
||||
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
|
||||
html = dehyphenator(html,'html_cleanup', length)
|
||||
html = dehyphenator(html, 'individual_words', length)
|
||||
|
||||
# If still no sections after unwrapping mark split points on lines with no punctuation
|
||||
if self.html_preprocess_sections < self.min_chapters:
|
||||
self.log("Looking for more split points based on punctuation,"
|
||||
if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
|
||||
self.log.debug("Looking for more split points based on punctuation,"
|
||||
" currently have " + unicode(self.html_preprocess_sections))
|
||||
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
|
||||
html = chapdetect3.sub(self.chapter_break, html)
|
||||
|
||||
if getattr(self.extra_opts, 'renumber_headings', False):
|
||||
# search for places where a first or second level heading is immediately followed by another
|
||||
# top level heading. demote the second heading to h3 to prevent splitting between chapter
|
||||
# headings and titles, images, etc
|
||||
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
|
||||
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
|
||||
|
||||
# put back non-breaking spaces in empty paragraphs to preserve original formatting
|
||||
html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
|
||||
|
||||
if getattr(self.extra_opts, 'format_scene_breaks', False):
|
||||
# Center separator lines
|
||||
html = re.sub(u'<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
|
||||
html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
|
||||
if not self.blanks_deleted:
|
||||
html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
|
||||
html = re.sub('<p\s+id="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
|
||||
|
||||
if self.deleted_nbsps:
|
||||
# put back non-breaking spaces in empty paragraphs to preserve original formatting
|
||||
html = self.blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
|
||||
|
||||
return html
|
||||
|
@ -16,7 +16,6 @@ import uuid
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre import guess_type
|
||||
from calibre import prepare_string_for_xml
|
||||
from calibre.constants import __appname__, __version__
|
||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
|
||||
@ -102,6 +101,7 @@ class FB2MLizer(object):
|
||||
metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year)
|
||||
metadata['lang'] = u''.join(self.oeb_book.metadata.lang) if self.oeb_book.metadata.lang else 'en'
|
||||
metadata['id'] = None
|
||||
metadata['cover'] = self.get_cover()
|
||||
|
||||
author_parts = self.oeb_book.metadata.creator[0].value.split(' ')
|
||||
if len(author_parts) == 1:
|
||||
@ -124,6 +124,7 @@ class FB2MLizer(object):
|
||||
metadata['id'] = str(uuid.uuid4())
|
||||
|
||||
for key, value in metadata.items():
|
||||
if not key == 'cover':
|
||||
metadata[key] = prepare_string_for_xml(value)
|
||||
|
||||
return u'<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:xlink="http://www.w3.org/1999/xlink">' \
|
||||
@ -136,6 +137,7 @@ class FB2MLizer(object):
|
||||
'<last-name>%(author_last)s</last-name>' \
|
||||
'</author>' \
|
||||
'<book-title>%(title)s</book-title>' \
|
||||
'%(cover)s' \
|
||||
'<lang>%(lang)s</lang>' \
|
||||
'</title-info>' \
|
||||
'<document-info>' \
|
||||
@ -154,6 +156,39 @@ class FB2MLizer(object):
|
||||
def fb2_footer(self):
|
||||
return u'</FictionBook>'
|
||||
|
||||
def get_cover(self):
|
||||
cover_href = None
|
||||
|
||||
# Get the raster cover if it's available.
|
||||
if self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
|
||||
id = unicode(self.oeb_book.metadata.cover[0])
|
||||
cover_item = self.oeb_book.manifest.ids[id]
|
||||
if cover_item.media_type in OEB_RASTER_IMAGES:
|
||||
cover_href = cover_item.href
|
||||
else:
|
||||
# Figure out if we have a title page or a cover page
|
||||
page_name = ''
|
||||
if 'titlepage' in self.oeb_book.guide:
|
||||
page_name = 'titlepage'
|
||||
elif 'cover' in self.oeb_book.guide:
|
||||
page_name = 'cover'
|
||||
|
||||
if page_name:
|
||||
cover_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[page_name].href]
|
||||
# Get the first image in the page
|
||||
for img in cover_item.xpath('//img'):
|
||||
cover_href = cover_item.abshref(img.get('src'))
|
||||
break
|
||||
|
||||
if cover_href:
|
||||
# Only write the image tag if it is in the manifest.
|
||||
if cover_href in self.oeb_book.manifest.hrefs.keys():
|
||||
if cover_href not in self.image_hrefs.keys():
|
||||
self.image_hrefs[cover_href] = '_%s.jpg' % len(self.image_hrefs.keys())
|
||||
return u'<coverpage><image xlink:href="#%s" /></coverpage>' % self.image_hrefs[cover_href]
|
||||
|
||||
return u''
|
||||
|
||||
def get_text(self):
|
||||
text = ['<body>']
|
||||
|
||||
@ -162,23 +197,6 @@ class FB2MLizer(object):
|
||||
text.append('<section>')
|
||||
self.section_level += 1
|
||||
|
||||
# Insert the title page / cover into the spine if it is not already referenced.
|
||||
title_name = u''
|
||||
if 'titlepage' in self.oeb_book.guide:
|
||||
title_name = 'titlepage'
|
||||
elif 'cover' in self.oeb_book.guide:
|
||||
title_name = 'cover'
|
||||
if title_name:
|
||||
title_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[title_name].href]
|
||||
if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml':
|
||||
self.oeb_book.spine.insert(0, title_item, True)
|
||||
# Create xhtml page to reference cover image so it can be used.
|
||||
if not title_name and self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
|
||||
id = unicode(self.oeb_book.metadata.cover[0])
|
||||
cover_item = self.oeb_book.manifest.ids[id]
|
||||
if cover_item.media_type in OEB_RASTER_IMAGES:
|
||||
self.insert_image_cover(cover_item.href)
|
||||
|
||||
for item in self.oeb_book.spine:
|
||||
self.log.debug('Converting %s to FictionBook2 XML' % item.href)
|
||||
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
|
||||
@ -203,17 +221,6 @@ class FB2MLizer(object):
|
||||
|
||||
return ''.join(text) + '</body>'
|
||||
|
||||
def insert_image_cover(self, image_href):
|
||||
from calibre.ebooks.oeb.base import RECOVER_PARSER
|
||||
try:
|
||||
root = etree.fromstring(u'<html xmlns="%s"><body><img src="%s" /></body></html>' % (XHTML_NS, image_href), parser=RECOVER_PARSER)
|
||||
except:
|
||||
root = etree.fromstring(u'', parser=RECOVER_PARSER)
|
||||
|
||||
id, href = self.oeb_book.manifest.generate('fb2_cover', 'fb2_cover.xhtml')
|
||||
item = self.oeb_book.manifest.add(id, href, guess_type(href)[0], data=root)
|
||||
self.oeb_book.spine.insert(0, item, True)
|
||||
|
||||
def fb2mlize_images(self):
|
||||
'''
|
||||
This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function.
|
||||
|
@ -46,15 +46,19 @@ class FB2Input(InputFormatPlugin):
|
||||
log.debug('Parsing XML...')
|
||||
raw = stream.read().replace('\0', '')
|
||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||
assume_utf8=True)[0]
|
||||
assume_utf8=True, resolve_entities=True)[0]
|
||||
try:
|
||||
doc = etree.fromstring(raw)
|
||||
except etree.XMLSyntaxError:
|
||||
try:
|
||||
doc = etree.fromstring(raw, parser=RECOVER_PARSER)
|
||||
if doc is None:
|
||||
raise Exception('parse failed')
|
||||
except:
|
||||
doc = etree.fromstring(raw.replace('& ', '&'),
|
||||
parser=RECOVER_PARSER)
|
||||
if doc is None:
|
||||
raise ValueError('The FB2 file is not valid XML')
|
||||
stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
|
||||
css = ''
|
||||
for s in stylesheets:
|
||||
@ -100,7 +104,11 @@ class FB2Input(InputFormatPlugin):
|
||||
entries = [(f, guess_type(f)[0]) for f in os.listdir('.')]
|
||||
opf.create_manifest(entries)
|
||||
opf.create_spine(['index.xhtml'])
|
||||
|
||||
if mi.cover_data and mi.cover_data[1]:
|
||||
with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
|
||||
f.write(mi.cover_data[1])
|
||||
opf.guide.set_cover(os.path.abspath('fb2_cover_calibre_mi.jpg'))
|
||||
else:
|
||||
for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
|
||||
href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
|
||||
if href is not None:
|
||||
|
@ -21,10 +21,9 @@ from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.customize.conversion import OptionRecommendation
|
||||
from calibre.constants import islinux, isfreebsd, iswindows
|
||||
from calibre import unicode_path
|
||||
from calibre import unicode_path, as_unicode
|
||||
from calibre.utils.localization import get_lang
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
from calibre.ebooks.conversion.utils import PreProcessor
|
||||
|
||||
class Link(object):
|
||||
'''
|
||||
@@ -112,14 +111,14 @@ class HTMLFile(object):
with open(self.path, 'rb') as f:
src = f.read()
except IOError, err:
msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
msg = 'Could not read from file: %s with error: %s'%(self.path, as_unicode(err))
if level == 0:
raise IOError(msg)
raise IgnoreFile(msg, err.errno)

self.is_binary = level > 0 and not bool(self.HTML_PAT.search(src[:4096]))
if not self.is_binary:
if encoding is None:
if not encoding:
encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
self.encoding = encoding
else:
@@ -296,7 +295,7 @@ class HTMLInput(InputFormatPlugin):
return oeb

from calibre.ebooks.conversion.plumber import create_oebbook
return create_oebbook(log, stream.name, opts, self,
return create_oebbook(log, stream.name, opts,
encoding=opts.input_encoding)

def is_case_sensitive(self, path):
@@ -485,9 +484,3 @@ class HTMLInput(InputFormatPlugin):
self.log.exception('Failed to read CSS file: %r'%link)
return (None, None)
return (None, raw)

def preprocess_html(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

@ -7,8 +7,6 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.conversion.utils import PreProcessor
|
||||
|
||||
|
||||
class LITInput(InputFormatPlugin):
|
||||
|
||||
@ -22,7 +20,7 @@ class LITInput(InputFormatPlugin):
|
||||
from calibre.ebooks.lit.reader import LitReader
|
||||
from calibre.ebooks.conversion.plumber import create_oebbook
|
||||
self.log = log
|
||||
return create_oebbook(log, stream, options, self, reader=LitReader)
|
||||
return create_oebbook(log, stream, options, reader=LitReader)
|
||||
|
||||
def postprocess_book(self, oeb, opts, log):
|
||||
from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
|
||||
@ -39,10 +37,13 @@ class LITInput(InputFormatPlugin):
|
||||
body = body[0]
|
||||
if len(body) == 1 and body[0].tag == XHTML('pre'):
|
||||
pre = body[0]
|
||||
from calibre.ebooks.txt.processor import convert_basic
|
||||
from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
|
||||
separate_paragraphs_single_line
|
||||
from lxml import etree
|
||||
import copy
|
||||
html = convert_basic(pre.text).replace('<html>',
|
||||
html = separate_paragraphs_single_line(pre.text)
|
||||
html = preserve_spaces(html)
|
||||
html = convert_basic(html).replace('<html>',
|
||||
'<html xmlns="%s">'%XHTML_NS)
|
||||
root = etree.fromstring(html)
|
||||
body = XPath('//h:body')(root)
|
||||
@ -51,10 +52,3 @@ class LITInput(InputFormatPlugin):
|
||||
for elem in body:
|
||||
ne = copy.deepcopy(elem)
|
||||
pre.append(ne)
|
||||
|
||||
|
||||
def preprocess_html(self, options, html):
|
||||
self.options = options
|
||||
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
|
||||
return preprocessor(html)
|
||||
|
||||
|
@ -12,7 +12,6 @@ from copy import deepcopy
|
||||
from lxml import etree
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.conversion.utils import PreProcessor
|
||||
from calibre import guess_type
|
||||
|
||||
class Canvas(etree.XSLTExtension):
|
||||
@ -419,11 +418,3 @@ class LRFInput(InputFormatPlugin):
|
||||
f.write(result)
|
||||
styles.write()
|
||||
return os.path.abspath('content.opf')
|
||||
|
||||
def preprocess_html(self, options, html):
|
||||
self.options = options
|
||||
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
|
||||
return preprocessor(html)
|
||||
|
||||
|
||||
|
||||
|
@ -324,14 +324,16 @@ class Metadata(object):
|
||||
if metadata is None:
|
||||
traceback.print_stack()
|
||||
return
|
||||
metadata = copy.deepcopy(metadata)
|
||||
if '#value#' not in metadata:
|
||||
if metadata['datatype'] == 'text' and metadata['is_multiple']:
|
||||
metadata['#value#'] = []
|
||||
m = {}
|
||||
for k in metadata:
|
||||
m[k] = copy.copy(metadata[k])
|
||||
if '#value#' not in m:
|
||||
if m['datatype'] == 'text' and m['is_multiple']:
|
||||
m['#value#'] = []
|
||||
else:
|
||||
metadata['#value#'] = None
|
||||
m['#value#'] = None
|
||||
_data = object.__getattribute__(self, '_data')
|
||||
_data['user_metadata'][field] = metadata
|
||||
_data['user_metadata'][field] = m
|
||||
|
||||
def template_to_attribute(self, other, ops):
|
||||
'''
|
||||
|
@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
Fetch cover from LibraryThing.com based on ISBN number.
|
||||
'''
|
||||
|
||||
import sys, socket, os, re
|
||||
import sys, socket, os, re, random
|
||||
|
||||
from lxml import html
|
||||
import mechanize
|
||||
@ -16,13 +16,26 @@ from calibre.ebooks.chardet import strip_encoding_declarations
|
||||
|
||||
OPENLIBRARY = 'http://covers.openlibrary.org/b/isbn/%s-L.jpg?default=false'
|
||||
|
||||
def get_ua():
|
||||
choices = [
|
||||
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11'
|
||||
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'
|
||||
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'
|
||||
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)'
|
||||
'Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16'
|
||||
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.1 Safari/525.19'
|
||||
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11'
|
||||
]
|
||||
return choices[random.randint(0, len(choices)-1)]
|
||||
|
||||
|
||||
class HeadRequest(mechanize.Request):
|
||||
|
||||
def get_method(self):
|
||||
return 'HEAD'
|
||||
|
||||
def check_for_cover(isbn, timeout=5.):
|
||||
br = browser()
|
||||
br = browser(user_agent=get_ua())
|
||||
br.set_handle_redirect(False)
|
||||
try:
|
||||
br.open_novisit(HeadRequest(OPENLIBRARY%isbn), timeout=timeout)
|
||||
@ -51,7 +64,7 @@ def login(br, username, password, force=True):
|
||||
|
||||
def cover_from_isbn(isbn, timeout=5., username=None, password=None):
|
||||
src = None
|
||||
br = browser()
|
||||
br = browser(user_agent=get_ua())
|
||||
try:
|
||||
return br.open(OPENLIBRARY%isbn, timeout=timeout).read(), 'jpg'
|
||||
except:
|
||||
@ -100,7 +113,7 @@ def get_social_metadata(title, authors, publisher, isbn, username=None,
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
mi = MetaInformation(title, authors)
|
||||
if isbn:
|
||||
br = browser()
|
||||
br = browser(user_agent=get_ua())
|
||||
if username and password:
|
||||
try:
|
||||
login(br, username, password, force=False)
|
||||
|
@ -10,7 +10,8 @@ from calibre.ebooks.metadata import MetaInformation, string_to_authors
|
||||
title_pat = re.compile(r'\{\\info.*?\{\\title(.*?)(?<!\\)\}', re.DOTALL)
|
||||
author_pat = re.compile(r'\{\\info.*?\{\\author(.*?)(?<!\\)\}', re.DOTALL)
|
||||
comment_pat = re.compile(r'\{\\info.*?\{\\subject(.*?)(?<!\\)\}', re.DOTALL)
|
||||
category_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)
|
||||
tags_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)
|
||||
publisher_pat = re.compile(r'\{\\info.*?\{\\manager(.*?)(?<!\\)\}', re.DOTALL)
|
||||
|
||||
def get_document_info(stream):
|
||||
"""
|
||||
@ -82,61 +83,73 @@ def decode(raw, codec):
|
||||
|
||||
def get_metadata(stream):
|
||||
""" Return metadata as a L{MetaInfo} object """
|
||||
title, author, comment, category = None, None, None, None
|
||||
stream.seek(0)
|
||||
if stream.read(5) != r'{\rtf':
|
||||
return MetaInformation(None, None)
|
||||
return MetaInformation(_('Unknown'))
|
||||
block = get_document_info(stream)[0]
|
||||
if not block:
|
||||
return MetaInformation(None, None)
|
||||
return MetaInformation(_('Unknown'))
|
||||
|
||||
stream.seek(0)
|
||||
cpg = detect_codepage(stream)
|
||||
stream.seek(0)
|
||||
|
||||
title_match = title_pat.search(block)
|
||||
if title_match:
|
||||
if title_match is not None:
|
||||
title = decode(title_match.group(1).strip(), cpg)
|
||||
else:
|
||||
title = _('Unknown')
|
||||
author_match = author_pat.search(block)
|
||||
if author_match:
|
||||
if author_match is not None:
|
||||
author = decode(author_match.group(1).strip(), cpg)
|
||||
comment_match = comment_pat.search(block)
|
||||
if comment_match:
|
||||
comment = decode(comment_match.group(1).strip(), cpg)
|
||||
category_match = category_pat.search(block)
|
||||
if category_match:
|
||||
category = decode(category_match.group(1).strip(), cpg)
|
||||
mi = MetaInformation(title, author)
|
||||
else:
|
||||
author = None
|
||||
mi = MetaInformation(title)
|
||||
if author:
|
||||
mi.authors = string_to_authors(author)
|
||||
|
||||
comment_match = comment_pat.search(block)
|
||||
if comment_match is not None:
|
||||
comment = decode(comment_match.group(1).strip(), cpg)
|
||||
mi.comments = comment
|
||||
mi.category = category
|
||||
tags_match = tags_pat.search(block)
|
||||
if tags_match is not None:
|
||||
tags = decode(tags_match.group(1).strip(), cpg)
|
||||
mi.tags = tags
|
||||
publisher_match = publisher_pat.search(block)
|
||||
if publisher_match is not None:
|
||||
publisher = decode(publisher_match.group(1).strip(), cpg)
|
||||
mi.publisher = publisher
|
||||
|
||||
return mi
|
||||
|
||||
|
||||
def create_metadata(stream, options):
|
||||
md = r'{\info'
|
||||
md = [r'{\info']
|
||||
if options.title:
|
||||
title = options.title.encode('ascii', 'ignore')
|
||||
md += r'{\title %s}'%(title,)
|
||||
md.append(r'{\title %s}'%(title,))
|
||||
if options.authors:
|
||||
au = options.authors
|
||||
if not isinstance(au, basestring):
|
||||
au = u', '.join(au)
|
||||
author = au.encode('ascii', 'ignore')
|
||||
md += r'{\author %s}'%(author,)
|
||||
if options.get('category', None):
|
||||
category = options.category.encode('ascii', 'ignore')
|
||||
md += r'{\category %s}'%(category,)
|
||||
md.append(r'{\author %s}'%(author,))
|
||||
comp = options.comment if hasattr(options, 'comment') else options.comments
|
||||
if comp:
|
||||
comment = comp.encode('ascii', 'ignore')
|
||||
md += r'{\subject %s}'%(comment,)
|
||||
if len(md) > 6:
|
||||
md += '}'
|
||||
md.append(r'{\subject %s}'%(comment,))
|
||||
if options.publisher:
|
||||
publisher = options.publisher.encode('ascii', 'ignore')
|
||||
md.append(r'{\manager %s}'%(publisher,))
|
||||
if options.tags:
|
||||
tags = u', '.join(options.tags)
|
||||
tags = tags.encode('ascii', 'ignore')
|
||||
md.append(r'{\category %s}'%(tags,))
|
||||
if len(md) > 1:
|
||||
md.append('}')
|
||||
stream.seek(0)
|
||||
src = stream.read()
|
||||
ans = src[:6] + md + src[6:]
|
||||
ans = src[:6] + u''.join(md) + src[6:]
|
||||
stream.seek(0)
|
||||
stream.write(ans)
|
||||
|
||||
@ -156,7 +169,7 @@ def set_metadata(stream, options):
|
||||
|
||||
base_pat = r'\{\\name(.*?)(?<!\\)\}'
|
||||
title = options.title
|
||||
if title != None:
|
||||
if title is not None:
|
||||
title = title.encode('ascii', 'replace')
|
||||
pat = re.compile(base_pat.replace('name', 'title'), re.DOTALL)
|
||||
if pat.search(src):
|
||||
@ -164,7 +177,7 @@ def set_metadata(stream, options):
|
||||
else:
|
||||
src = add_metadata_item(src, 'title', title)
|
||||
comment = options.comments
|
||||
if comment != None:
|
||||
if comment is not None:
|
||||
comment = comment.encode('ascii', 'replace')
|
||||
pat = re.compile(base_pat.replace('name', 'subject'), re.DOTALL)
|
||||
if pat.search(src):
|
||||
@ -172,7 +185,7 @@ def set_metadata(stream, options):
|
||||
else:
|
||||
src = add_metadata_item(src, 'subject', comment)
|
||||
author = options.authors
|
||||
if author != None:
|
||||
if author is not None:
|
||||
author = ', '.join(author)
|
||||
author = author.encode('ascii', 'ignore')
|
||||
pat = re.compile(base_pat.replace('name', 'author'), re.DOTALL)
|
||||
@ -180,14 +193,23 @@ def set_metadata(stream, options):
|
||||
src = pat.sub(r'{\\author ' + author + r'}', src)
|
||||
else:
|
||||
src = add_metadata_item(src, 'author', author)
|
||||
category = options.get('category', None)
|
||||
if category != None:
|
||||
category = category.encode('ascii', 'replace')
|
||||
tags = options.tags
|
||||
if tags is not None:
|
||||
tags = ', '.join(tags)
|
||||
tags = tags.encode('ascii', 'replace')
|
||||
pat = re.compile(base_pat.replace('name', 'category'), re.DOTALL)
|
||||
if pat.search(src):
|
||||
src = pat.sub(r'{\\category ' + category + r'}', src)
|
||||
src = pat.sub(r'{\\category ' + tags + r'}', src)
|
||||
else:
|
||||
src = add_metadata_item(src, 'category', category)
|
||||
src = add_metadata_item(src, 'category', tags)
|
||||
publisher = options.publisher
|
||||
if publisher is not None:
|
||||
publisher = publisher.encode('ascii', 'replace')
|
||||
pat = re.compile(base_pat.replace('name', 'manager'), re.DOTALL)
|
||||
if pat.search(src):
|
||||
src = pat.sub(r'{\\manager ' + publisher + r'}', src)
|
||||
else:
|
||||
src = add_metadata_item(src, 'manager', publisher)
|
||||
stream.seek(pos + olen)
|
||||
after = stream.read()
|
||||
stream.seek(pos)
|
||||
|
@ -3,7 +3,6 @@ __license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
|
||||
class MOBIInput(InputFormatPlugin):
|
||||
@ -39,11 +38,3 @@ class MOBIInput(InputFormatPlugin):
|
||||
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
|
||||
return mr.created_opf_path
|
||||
|
||||
def preprocess_html(self, options, html):
|
||||
# search for places where a first or second level heading is immediately followed by another
|
||||
# top level heading. demote the second heading to h3 to prevent splitting between chapter
|
||||
# headings and titles, images, etc
|
||||
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
|
||||
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
|
||||
return html
|
||||
|
||||
|
@ -139,7 +139,7 @@ class BookHeader(object):
|
||||
65001: 'utf-8',
|
||||
}[self.codepage]
|
||||
except (IndexError, KeyError):
|
||||
self.codec = 'cp1252' if user_encoding is None else user_encoding
|
||||
self.codec = 'cp1252' if not user_encoding else user_encoding
|
||||
log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
|
||||
self.codec))
|
||||
if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length \
|
||||
@ -541,6 +541,16 @@ class MobiReader(object):
|
||||
pass
|
||||
elif tag.tag == 'img':
|
||||
tag.set('height', height)
|
||||
else:
|
||||
if tag.tag == 'div' and not tag.text and \
|
||||
(not tag.tail or not tag.tail.strip()) and \
|
||||
not len(list(tag.iterdescendants())):
|
||||
# Paragraph spacer
|
||||
# Insert nbsp so that the element is never
|
||||
# discarded by a renderer
|
||||
tag.text = u'\u00a0' # nbsp
|
||||
styles.append('height: %s' %
|
||||
self.ensure_unit(height))
|
||||
else:
|
||||
styles.append('margin-top: %s' % self.ensure_unit(height))
|
||||
if attrib.has_key('width'):
|
||||
@ -632,9 +642,18 @@ class MobiReader(object):
|
||||
attrib['class'] = cls
|
||||
|
||||
for tag in svg_tags:
|
||||
p = tag.getparent()
|
||||
if hasattr(p, 'remove'):
|
||||
p.remove(tag)
|
||||
images = tag.xpath('descendant::img[@src]')
|
||||
parent = tag.getparent()
|
||||
|
||||
if images and hasattr(parent, 'find'):
|
||||
index = parent.index(tag)
|
||||
for img in images:
|
||||
img.getparent().remove(img)
|
||||
img.tail = img.text = None
|
||||
parent.insert(index, img)
|
||||
|
||||
if hasattr(parent, 'remove'):
|
||||
parent.remove(tag)
|
||||
|
||||
def create_opf(self, htmlfile, guide=None, root=None):
|
||||
mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
|
||||
|
@ -251,7 +251,7 @@ class Serializer(object):
|
||||
tag = prefixname(elem.tag, nsrmap)
|
||||
# Previous layers take care of @name
|
||||
id = elem.attrib.pop('id', None)
|
||||
if id is not None:
|
||||
if id:
|
||||
href = '#'.join((item.href, id))
|
||||
offset = self.anchor_offset or buffer.tell()
|
||||
self.id_offsets[urlnormalize(href)] = offset
|
||||
@ -1541,7 +1541,10 @@ class MobiWriter(object):
|
||||
exth.write(data)
|
||||
nrecs += 1
|
||||
if term == 'rights' :
|
||||
try:
|
||||
rights = unicode(oeb.metadata.rights[0]).encode('utf-8')
|
||||
except:
|
||||
rights = 'Unknown'
|
||||
exth.write(pack('>II', EXTH_CODES['rights'], len(rights) + 8))
|
||||
exth.write(rights)
|
||||
|
||||
|
@ -1892,7 +1892,7 @@ class OEBBook(object):
return fix_data(data.decode(bom_enc))
except UnicodeDecodeError:
pass
if self.input_encoding is not None:
if self.input_encoding:
try:
return fix_data(data.decode(self.input_encoding, 'replace'))
except UnicodeDecodeError:

@ -199,8 +199,8 @@ class EbookIterator(object):
not hasattr(self.pathtoopf, 'manifest'):
if hasattr(self.pathtoopf, 'manifest'):
self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
self.pathtoopf = create_oebbook(self.log, self.pathtoopf, plumber.opts,
plumber.input_plugin)
self.pathtoopf = create_oebbook(self.log, self.pathtoopf,
plumber.opts)

if hasattr(self.pathtoopf, 'manifest'):
self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
@ -227,7 +227,7 @@ class EbookIterator(object):
self.log.warn('Missing spine item:', repr(spath))

cover = self.opf.cover
if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf') and cover:
if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf', 'fb2') and cover:
cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
chtml = (TITLEPAGE%os.path.relpath(cover, self.base).replace(os.sep,
'/')).encode('utf-8')

@ -10,7 +10,7 @@ import os
from calibre.utils.date import isoformat, now
from calibre import guess_type

def meta_info_to_oeb_metadata(mi, m, log):
def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False):
from calibre.ebooks.oeb.base import OPF
if not mi.is_null('title'):
m.clear('title')
@ -29,15 +29,23 @@ def meta_info_to_oeb_metadata(mi, m, log):
if not mi.is_null('book_producer'):
m.filter('contributor', lambda x : x.role.lower() == 'bkp')
m.add('contributor', mi.book_producer, role='bkp')
elif override_input_metadata:
m.filter('contributor', lambda x : x.role.lower() == 'bkp')
if not mi.is_null('comments'):
m.clear('description')
m.add('description', mi.comments)
elif override_input_metadata:
m.clear('description')
if not mi.is_null('publisher'):
m.clear('publisher')
m.add('publisher', mi.publisher)
elif override_input_metadata:
m.clear('publisher')
if not mi.is_null('series'):
m.clear('series')
m.add('series', mi.series)
elif override_input_metadata:
m.clear('series')
if not mi.is_null('isbn'):
has = False
for x in m.identifier:
@ -46,19 +54,27 @@ def meta_info_to_oeb_metadata(mi, m, log):
has = True
if not has:
m.add('identifier', mi.isbn, scheme='ISBN')
elif override_input_metadata:
m.filter('identifier', lambda x: x.scheme.lower() == 'isbn')
if not mi.is_null('language'):
m.clear('language')
m.add('language', mi.language)
if not mi.is_null('series_index'):
m.clear('series_index')
m.add('series_index', mi.format_series_index())
elif override_input_metadata:
m.clear('series_index')
if not mi.is_null('rating'):
m.clear('rating')
m.add('rating', '%.2f'%mi.rating)
elif override_input_metadata:
m.clear('rating')
if not mi.is_null('tags'):
m.clear('subject')
for t in mi.tags:
m.add('subject', t)
elif override_input_metadata:
m.clear('subject')
if not mi.is_null('pubdate'):
m.clear('date')
m.add('date', isoformat(mi.pubdate))
@ -71,6 +87,7 @@ def meta_info_to_oeb_metadata(mi, m, log):
if not mi.is_null('publication_type'):
m.clear('publication_type')
m.add('publication_type', mi.publication_type)

if not m.timestamp:
m.add('timestamp', isoformat(now()))

@ -78,11 +95,12 @@ def meta_info_to_oeb_metadata(mi, m, log):
class MergeMetadata(object):
'Merge in user metadata, including cover'

def __call__(self, oeb, mi, opts):
def __call__(self, oeb, mi, opts, override_input_metadata=False):
self.oeb, self.log = oeb, oeb.log
m = self.oeb.metadata
self.log('Merging user specified metadata...')
meta_info_to_oeb_metadata(mi, m, oeb.log)
meta_info_to_oeb_metadata(mi, m, oeb.log,
override_input_metadata=override_input_metadata)
cover_id = self.set_cover(mi, opts.prefer_metadata_cover)
m.clear('cover')
if cover_id is not None:

@ -9,7 +9,6 @@ import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
from calibre.ebooks.conversion.utils import PreProcessor

class PDBInput(InputFormatPlugin):

@ -32,8 +31,3 @@ class PDBInput(InputFormatPlugin):
opf = reader.extract_content(os.getcwd())

return opf

def preprocess_html(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

@ -65,9 +65,9 @@ class Reader(FormatReader):
from calibre.customize.ui import plugin_for_input_format

txt_plugin = plugin_for_input_format('txt')
for option in txt_plugin.options:
if not hasattr(self.options, option.option.name):
setattr(self.options, option.name, option.recommended_value)
for opt in txt_plugin.options:
if not hasattr(self.options, opt.option.name):
setattr(self.options, opt.option.name, opt.recommended_value)

stream.seek(0)
return txt_plugin.convert(stream, self.options, 'txt', self.log, {})

@ -31,9 +31,9 @@ class Reader(FormatReader):
from calibre.customize.ui import plugin_for_input_format

pdf_plugin = plugin_for_input_format('pdf')
for option in pdf_plugin.options:
if not hasattr(self.options, option.option.name):
setattr(self.options, option.name, option.recommended_value)
for opt in pdf_plugin.options:
if not hasattr(self.options, opt.option.name):
setattr(self.options, opt.option.name, opt.recommended_value)

pdf.seek(0)
return pdf_plugin.convert(pdf, self.options, 'pdf', self.log, {})

@ -83,9 +83,9 @@ class Reader(FormatReader):
from calibre.customize.ui import plugin_for_input_format

txt_plugin = plugin_for_input_format('txt')
for option in txt_plugin.options:
if not hasattr(self.options, option.option.name):
setattr(self.options, option.name, option.recommended_value)
for opt in txt_plugin.options:
if not hasattr(self.options, opt.option.name):
setattr(self.options, opt.option.name, opt.recommended_value)

stream.seek(0)
return txt_plugin.convert(stream, self.options, 'txt', self.log, {})

@ -34,18 +34,15 @@ class PML_HTMLizer(object):
'ra',
'c',
'r',
't',
's',
'l',
'k',
'T',
'FN',
'SB',
]

STATES_VALUE_REQ = [
'a',
'T',
'FN',
'SB',
]
@ -96,8 +93,6 @@ class PML_HTMLizer(object):
'Sb': 'sb',
'c': 'c',
'r': 'r',
't': 't',
'T': 'T',
'i': 'i',
'I': 'i',
'u': 'u',
@ -133,8 +128,6 @@ class PML_HTMLizer(object):
DIV_STATES = [
'c',
'r',
't',
'T',
'FN',
'SB',
]
@ -255,8 +248,6 @@ class PML_HTMLizer(object):

for key, val in self.state.items():
if val[0]:
if key == 'T':
self.state['T'][0] = False
if key in self.DIV_STATES:
div.append(key)
elif key in self.SPAN_STATES:
@ -506,6 +497,9 @@ class PML_HTMLizer(object):
self.toc = TOC()
self.file_name = file_name

indent_state = {'t': False, 'T': False}
adv_indent_val = ''

for s in self.STATES:
self.state[s] = [False, ''];

@ -515,6 +509,8 @@ class PML_HTMLizer(object):

parsed = []
empty = True
basic_indent = indent_state['t']
adv_indent = indent_state['T']

# Must use StringIO, cStringIO does not support unicode
line = StringIO.StringIO(line)
@ -527,7 +523,7 @@ class PML_HTMLizer(object):
if c == '\\':
c = line.read(1)

if c in 'qcrtTiIuobBlk':
if c in 'qcriIuobBlk':
text = self.process_code(c, line)
elif c in 'FS':
l = line.read(1)
@ -574,6 +570,15 @@ class PML_HTMLizer(object):
elif c == 'w':
empty = False
text = '<hr width="%s" />' % self.code_value(line)
elif c == 't':
indent_state[c] = not indent_state[c]
if indent_state[c]:
basic_indent = True
elif c == 'T':
indent_state[c] = not indent_state[c]
if indent_state[c]:
adv_indent = True
adv_indent_val = self.code_value(line)
elif c == '-':
empty = False
text = '­'
@ -590,6 +595,16 @@ class PML_HTMLizer(object):
if not empty:
text = self.end_line()
parsed.append(text)

if basic_indent:
parsed.insert(0, self.STATES_TAGS['t'][0])
parsed.append(self.STATES_TAGS['t'][1])
elif adv_indent:
parsed.insert(0, self.STATES_TAGS['T'][0] % adv_indent_val)
parsed.append(self.STATES_TAGS['T'][1])
indent_state['T'] = False
adv_indent_val = ''

output.append(u''.join(parsed))
line.close()

@ -7,7 +7,6 @@ import os, glob, re, textwrap
from lxml import etree

from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.utils import PreProcessor

border_style_map = {
'single' : 'solid',
@ -77,7 +76,15 @@ class RTFInput(InputFormatPlugin):

def generate_xml(self, stream):
from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
ofile = 'out.xml'
ofile = 'dataxml.xml'
run_lev, debug_dir = 1, None
if getattr(self.opts, 'debug_pipeline', None) is not None:
try:
os.mkdir(debug_dir)
debug_dir = 'rtfdebug'
run_lev = 4
except:
pass
parser = ParseRtf(
in_file = stream,
out_file = ofile,
@ -115,43 +122,45 @@ class RTFInput(InputFormatPlugin):

# Write or do not write paragraphs. Default is 0.
empty_paragraphs = 1,

#debug
deb_dir = debug_dir,
run_level = run_lev,
)
parser.parse_rtf()
ans = open('out.xml').read()
os.remove('out.xml')
return ans
with open(ofile, 'rb') as f:
return f.read()

def extract_images(self, picts):
import imghdr
self.log('Extracting images...')

with open(picts, 'rb') as f:
raw = f.read()
picts = filter(len, re.findall(r'\{\\pict([^}]+)\}', raw))
hex = re.compile(r'[^a-fA-F0-9]')
encs = [hex.sub('', pict) for pict in picts]

count = 0
raw = open(picts, 'rb').read()
starts = []
for match in re.finditer(r'\{\\pict([^}]+)\}', raw):
starts.append(match.start(1))

imap = {}

for start in starts:
pos, bc = start, 1
while bc > 0:
if raw[pos] == '}': bc -= 1
elif raw[pos] == '{': bc += 1
pos += 1
pict = raw[start:pos+1]
enc = re.sub(r'[^a-zA-Z0-9]', '', pict)
for enc in encs:
if len(enc) % 2 == 1:
enc = enc[:-1]
data = enc.decode('hex')
fmt = imghdr.what(None, data)
if fmt is None:
fmt = 'wmf'
count += 1
name = (('%4d'%count).replace(' ', '0'))+'.wmf'
open(name, 'wb').write(data)
name = '%04d.%s' % (count, fmt)
with open(name, 'wb') as f:
f.write(data)
imap[count] = name
#open(name+'.hex', 'wb').write(enc)
return self.convert_images(imap)

def convert_images(self, imap):
for count, val in imap.items():
self.default_img = None
for count, val in imap.iteritems():
try:
imap[count] = self.convert_image(val)
except:
@ -159,11 +168,34 @@ class RTFInput(InputFormatPlugin):
return imap

def convert_image(self, name):
from calibre.utils.magick import Image
img = Image()
img.open(name)
if not name.endswith('.wmf'):
return name
try:
return self.rasterize_wmf(name)
except:
self.log.exception('Failed to convert WMF image %r'%name)
return self.replace_wmf(name)

def replace_wmf(self, name):
from calibre.ebooks import calibre_cover
if self.default_img is None:
self.default_img = calibre_cover('Conversion of WMF images is not supported',
'Use Microsoft Word or OpenOffice to save this RTF file'
' as HTML and convert that in calibre.', title_size=36,
author_size=20)
name = name.replace('.wmf', '.jpg')
img.save(name)
with open(name, 'wb') as f:
f.write(self.default_img)
return name

def rasterize_wmf(self, name):
from calibre.utils.wmf.parse import wmf_unwrap
with open(name, 'rb') as f:
data = f.read()
data = wmf_unwrap(data)
name = name.replace('.wmf', '.png')
with open(name, 'wb') as f:
f.write(data)
return name


@ -192,27 +224,27 @@ class RTFInput(InputFormatPlugin):
css += '\n'+'\n'.join(font_size_classes)
css += '\n' +'\n'.join(color_classes)

for cls, val in border_styles.items():
for cls, val in border_styles.iteritems():
css += '\n\n.%s {\n%s\n}'%(cls, val)

with open('styles.css', 'ab') as f:
f.write(css)

def preprocess(self, fname):
self.log('\tPreprocessing to convert unicode characters')
try:
data = open(fname, 'rb').read()
from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
tokenizer = RtfTokenizer(data)
tokens = RtfTokenParser(tokenizer.tokens)
data = tokens.toRTF()
fname = 'preprocessed.rtf'
with open(fname, 'wb') as f:
f.write(data)
except:
self.log.exception(
'Failed to preprocess RTF to convert unicode sequences, ignoring...')
return fname
# def preprocess(self, fname):
# self.log('\tPreprocessing to convert unicode characters')
# try:
# data = open(fname, 'rb').read()
# from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
# tokenizer = RtfTokenizer(data)
# tokens = RtfTokenParser(tokenizer.tokens)
# data = tokens.toRTF()
# fname = 'preprocessed.rtf'
# with open(fname, 'wb') as f:
# f.write(data)
# except:
# self.log.exception(
# 'Failed to preprocess RTF to convert unicode sequences, ignoring...')
# return fname

def convert_borders(self, doc):
border_styles = []
@ -249,17 +281,13 @@ class RTFInput(InputFormatPlugin):
self.log = log
self.log('Converting RTF to XML...')
#Name of the preprocesssed RTF file
fname = self.preprocess(stream.name)
# fname = self.preprocess(stream.name)
try:
xml = self.generate_xml(fname)
xml = self.generate_xml(stream.name)
except RtfInvalidCodeException, e:
raise ValueError(_('This RTF file has a feature calibre does not '
'support. Convert it to HTML first and then try it.\n%s')%e)

'''dataxml = open('dataxml.xml', 'w')
dataxml.write(xml)
dataxml.close'''

d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
if d:
imap = {}
@ -290,13 +318,9 @@ class RTFInput(InputFormatPlugin):
res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
# Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
if not getattr(self.opts, 'remove_paragraph_spacing', False):
res = re.sub('\s*<body>', '<body>', res)
res = re.sub('(?<=\n)\n{2}',
u'<p>\u00a0</p>\n'.encode('utf-8'), res)
if self.opts.preprocess_html:
preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None))
res = preprocessor(res)
f.write(res)
self.write_inline_css(inline_class, border_styles)
stream.seek(0)

@ -18,6 +18,7 @@
# $Revision: 1.41 $
# $Date: 2006/03/24 23:50:07 $
import sys, os

from calibre.ebooks.rtf2xml import headings_to_sections, \
line_endings, footnote, fields_small, default_encoding, \
make_lists, preamble_div, header, colors, group_borders, \
@ -90,7 +91,6 @@ class ParseRtf:
out_file = '',
out_dir = None,
dtd = '',
#debug = 0, #why? calibre
deb_dir = None,
convert_symbol = None,
convert_wingdings = None,
@ -107,6 +107,7 @@ class ParseRtf:
no_dtd = 0,
char_data = '',
):

"""
Requires:
'file' --file to parse
@ -119,12 +120,11 @@ class ParseRtf:
script tries to output to directory where is script is exectued.)
'deb_dir' --debug directory. If a debug_dir is provided, the script
will copy each run through as a file to examine in the debug_dir
'perl_script'--use perl to make tokens. This runs just a bit faster.
(I will probably phase this out.)
'check_brackets' -- make sure the brackets match up after each run
through a file. Only for debugging.
Returns: Nothing
"""

self.__file = in_file
self.__out_file = out_file
self.__out_dir = out_dir
@ -132,7 +132,7 @@ class ParseRtf:
self.__dtd_path = dtd
self.__check_file(in_file,"file_to_parse")
self.__char_data = char_data
self.__debug_dir = deb_dir #self.__debug_dir = debug calibre
self.__debug_dir = deb_dir
self.__check_dir(self.__temp_dir)
self.__copy = self.__check_dir(self.__debug_dir)
self.__convert_caps = convert_caps
@ -155,25 +155,24 @@ class ParseRtf:
if hasattr(the_file, 'read'): return
if the_file == None:
if type == "file_to_parse":
message = "You must provide a file for the script to work"
msg = message
msg = "\nYou must provide a file for the script to work"
raise RtfInvalidCodeException, msg
elif os.path.exists(the_file):
pass # do nothing
else:
message = "The file '%s' cannot be found" % the_file
msg = message
msg = "\nThe file '%s' cannot be found" % the_file
raise RtfInvalidCodeException, msg

def __check_dir(self, the_dir):
"""Check to see if directory exists"""
if not the_dir :
return
dir_exists = os.path.isdir(the_dir)
if not dir_exists:
message = "%s is not a directory" % the_dir
msg = message
msg = "\n%s is not a directory" % the_dir
raise RtfInvalidCodeException, msg
return 1

def parse_rtf(self):
"""
Parse the file by calling on other classes.
@ -194,13 +193,14 @@ class ParseRtf:
copy_obj.set_dir(self.__debug_dir)
copy_obj.remove_files()
copy_obj.copy_file(self.__temp_file, "original_file")
# new as of 2005-08-02. Do I want this?
# Function to check if bracket are well handled
if self.__debug_dir or self.__run_level > 2:
self.__check_brack_obj = check_brackets.CheckBrackets\
(file = self.__temp_file,
bug_handler = RtfInvalidCodeException,
)
# convert Macintosh line endings to Unix line endings
#convert Macintosh and Windows line endings to Unix line endings
#why do this if you don't wb after?
line_obj = line_endings.FixLineEndings(
in_file = self.__temp_file,
bug_handler = RtfInvalidCodeException,
@ -208,13 +208,13 @@ class ParseRtf:
run_level = self.__run_level,
replace_illegals = self.__replace_illegals,
)
return_value = line_obj.fix_endings()
return_value = line_obj.fix_endings() #calibre return what?
self.__return_code(return_value)
tokenize_obj = tokenize.Tokenize(
bug_handler = RtfInvalidCodeException,
in_file = self.__temp_file,
copy = self.__copy,
run_level = self.__run_level,)
run_level = self.__run_level)
tokenize_obj.tokenize()
process_tokens_obj = process_tokens.ProcessTokens(
in_file = self.__temp_file,
@ -226,15 +226,27 @@ class ParseRtf:
try:
return_value = process_tokens_obj.process_tokens()
except InvalidRtfException, msg:
#Check to see if the file is correctly encoded
encode_obj = default_encoding.DefaultEncoding(
in_file = self.__temp_file,
run_level = self.__run_level,
bug_handler = RtfInvalidCodeException,
check_raw = True,
)
platform, code_page, default_font_num = encode_obj.find_default_encoding()
check_encoding_obj = check_encoding.CheckEncoding(
bug_handler = RtfInvalidCodeException,
)
enc = 'cp' + encode_obj.get_codepage()
msg = 'Exception in token processing'
if check_encoding_obj.check_encoding(self.__file, enc):
file_name = self.__file if isinstance(self.__file, str) \
else self.__file.encode('utf-8')
msg = 'File %s does not appear to be correctly encoded.\n' % file_name
try:
os.remove(self.__temp_file)
except OSError:
pass
check_encoding_obj = check_encoding.CheckEncoding(
bug_handler = RtfInvalidCodeException,
)
check_encoding_obj.check_encoding(self.__file)
sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
raise InvalidRtfException, msg
delete_info_obj = delete_info.DeleteInfo(
in_file = self.__temp_file,
@ -508,6 +520,7 @@ class ParseRtf:
indent = self.__indent,
run_level = self.__run_level,
no_dtd = self.__no_dtd,
encoding = encode_obj.get_codepage(),
bug_handler = RtfInvalidCodeException,
)
tags_obj.convert_to_tags()
@ -520,6 +533,7 @@ class ParseRtf:
output_obj.output()
os.remove(self.__temp_file)
return self.__exit_level

def __bracket_match(self, file_name):
if self.__run_level > 2:
good_br, msg = self.__check_brack_obj.check_brackets()
@ -527,28 +541,20 @@ class ParseRtf:
pass
#sys.stderr.write( msg + ' in ' + file_name + "\n")
else:
msg += msg + " in file '" + file_name + "'\n"
msg = '%s in file %s\n' % (msg, file_name)
raise RtfInvalidCodeException, msg

def __return_code(self, num):
if num == None:
return
if int(num) > self.__exit_level:
self.__exit_level = num

def __make_temp_file(self,file):
"""Make a temporary file to parse"""
write_file="rtf_write_file"
read_obj = file if hasattr(file, 'read') else open(file,'r')
write_obj = open(write_file, 'w')
line = "dummy"
while line:
line = read_obj.read(1000)
with open(write_file, 'wb') as write_obj:
for line in read_obj:
write_obj.write(line)
write_obj.close()
return write_file
"""
mi<tg<open______<style-sheet\n
mi<tg<close_____<style-sheet\n
mi<tg<open-att__<footnote<num>1\n
mi<tg<empty-att_<page-definition<margin>33\n
mi<tg<empty_____<para\n
"""

@ -24,38 +24,38 @@ class CheckBrackets:
self.__ob_count = 0
self.__cb_count = 0
self.__open_bracket_num = []

def open_brack(self, line):
num = line[-5:-1]
self.__open_bracket_num.append(num)
self.__bracket_count += 1

def close_brack(self, line):
num = line[-5:-1]
##self.__open_bracket_num.append(num)
try:
last_num = self.__open_bracket_num.pop()
except:
return 0
return False
if num != last_num:
return 0
return False
self.__bracket_count -= 1
return 1
return True

def check_brackets(self):
read_obj = open(self.__file, 'r')
line = 'dummy'
line_count = 0
while line:
with open(self.__file, 'r') as read_obj:
for line in read_obj:
line_count += 1
line = read_obj.readline()
self.__token_info = line[:16]
if self.__token_info == 'ob<nu<open-brack':
self.open_brack(line)
if self.__token_info == 'cb<nu<clos-brack':
right_count = self.close_brack(line)
if not right_count:
return (0, "closed bracket doesn't match, line %s" % line_count)
read_obj.close()
if not self.close_brack(line):
return (False, "closed bracket doesn't match, line %s" % line_count)

if self.__bracket_count != 0:
msg = 'At end of file open and closed brackets don\'t match\n'
msg = msg + 'total number of brackets is %s' % self.__bracket_count
return (0, msg)
return (1, "brackets match!")
msg = ('At end of file open and closed brackets don\'t match\n' \
'total number of brackets is %s') % self.__bracket_count
return (False, msg)
return (True, "Brackets match!")


@ -1,8 +1,11 @@
#!/usr/bin/env python
import sys

class CheckEncoding:

def __init__(self, bug_handler):
self.__bug_handler = bug_handler

def __get_position_error(self, line, encoding, line_num):
char_position = 0
for char in line:
@ -12,21 +15,23 @@ class CheckEncoding:
except UnicodeError, msg:
sys.stderr.write('line: %s char: %s\n' % (line_num, char_position))
sys.stderr.write(str(msg) + '\n')
def check_encoding(self, path, encoding='us-ascii'):
read_obj = open(path, 'r')
line_to_read = 1

def check_encoding(self, path, encoding='us-ascii', verbose=True):
line_num = 0
while line_to_read:
with open(path, 'r') as read_obj:
for line in read_obj:
line_num += 1
line_to_read = read_obj.readline()
line = line_to_read
try:
line.decode(encoding)
except UnicodeError:
if verbose:
if len(line) < 1000:
self.__get_position_error(line, encoding, line_num)
else:
sys.stderr.write('line: %d has bad encoding\n' % line_num)
return True
return False

if __name__ == '__main__':
check_encoding_obj = CheckEncoding()
check_encoding_obj.check_encoding(sys.argv[1])

@ -16,7 +16,9 @@
# #
#########################################################################
import os, tempfile

from calibre.ebooks.rtf2xml import copy

class CombineBorders:
"""Combine borders in RTF tokens to make later processing easier"""
def __init__(self,
@ -32,19 +34,21 @@ class CombineBorders:
self.__state = 'default'
self.__bord_pos = 'default'
self.__bord_att = []

def found_bd(self, line):
#cw<bd<bor-t-r-vi
self.__state = 'border'
self.__bord_pos = line[6:16]

def __default_func(self, line):
#cw<bd<bor-t-r-vi
if self.__first_five == 'cw<bd':
self.found_bd(line)
return ''
return line

def end_border(self, line, write_obj):
joiner = "|"
border_string = joiner.join(self.__bord_att)
border_string = "|".join(self.__bord_att)
self.__bord_att = []
write_obj.write('cw<bd<%s<nu<%s\n' % (self.__bord_pos,
border_string))
@ -54,6 +58,7 @@ class CombineBorders:
self. found_bd(line)
else:
write_obj.write(line)

def add_to_border_desc(self, line):
#cw<bt<bdr-hair__<nu<true
#cw<bt<bdr-linew<nu<0.50
@ -65,26 +70,22 @@ class CombineBorders:
else:
num = ':' + num
self.__bord_att.append(border_desc + num)

def __border_func(self, line, write_obj):
if self.__first_five != 'cw<bt':
self.end_border(line, write_obj)
else:
self.add_to_border_desc(line)

def combine_borders(self):
read_obj = open(self.__file, 'r')
write_obj = open(self.__write_to, 'w')
line_to_read = 'dummy'
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
with open(self.__file, 'r') as read_obj:
with open(self.__write_to, 'w') as write_obj:
for line in read_obj:
self.__first_five = line[0:5]
if self.__state == 'border':
self.__border_func(line, write_obj)
else:
to_print = self.__default_func(line)
write_obj.write(to_print)
read_obj.close()
write_obj.close()
write_obj.write(self.__default_func(line))
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "combine_borders.data")