Mirror of https://github.com/kovidgoyal/calibre.git
Synced 2025-07-09 03:04:10 -04:00

Commit 4c0865ea71: Merge prior to trunk merge.

Changelog.yaml (336 lines changed)
@@ -4,6 +4,342 @@
# for important features/bug fixes.
# Also, each release can have new and improved recipes.

- version: 0.7.40
  date: 2011-01-14

  new features:
    - title: "A new 'highlight matches' search mode"
      description: >
        "There is now a checkbox next to the search bar named 'Highlight'. If you check it, searching will highlight
        all matched books instead of filtering the book list to all matched books."

    - title: "RTF Input: Improved support for conversion of images. The bug where some images were shrunk should no longer happen"

    - title: "Template language: Allow you to create your own formatting functions. Accessible via Preferences->Advanced->Template functions"

    - title: "News download: Convert various HTML 5 tags into <div> to support readers that cannot handle HTML 5 tags"

    - title: "RTF metadata: Add support for publisher and tags."
      tickets: [6657]

    - title: "BibTeX catalog: Add support for custom columns"

    - title: "TXT Input: Support for textile markup"

    - title: "Various minor tweaks to improve usability of Preferences->Plugins"

    - title: "TXT Output: Convert <hr> to scene break marker."

    - title: "Support for the Archos 70"

    - title: "SONY driver: Add an option to automatically refresh the covers on every connect. Accessible via: Preferences->Plugins->Device interface plugins"

    - title: "Add access to the larger template editor from plugboards via context menu."

    - title: "Speed improvement when connecting a large library to a device"

    - title: "Speedup when searching on multiple words in a large library"

    - title: "TXT Input: Add a heuristic formatting processor"

  bug fixes:
    - title: "Fix bug that caused automatic news removal to remove any book that has a tag that contains the word 'news' instead of only books that have the tag News"

    - title: "Refactor the downloading social metadata message box to allow canceling."
      tickets: [8234]

    - title: "Kobo driver: Handle null values in the DateCreated column"
      tickets: [8308]

    - title: "MOBI Input: Fix regression that caused images placed inside svg tags to be discarded"

    - title: "Fix bug where selecting the Tablet output profile would actually select the Samsung Galaxy S profile"

    - title: "Catalog generation: Fix a condition that could cause TOCs to not be properly generated in MOBI format catalogs"
      tickets: [8295]

    - title: "Zip file reading: Be more tolerant when a zip file has a damaged file directory"

    - title: "RTF Input: Various code cleanups. Go back to trying to handle unicode mappings without pre-processing. This will mean that some RTF files that used to convert won't anymore. Please open tickets and attach the affected files."
      tickets: [8171]

    - title: "ImageMagick: When identifying an image don't read the entire image"

    - title: "FB2 Output: Add cover to FB2 metadata."

    - title: "Fix inability to customize a built-in recipe when more than one recipe has the same name"
      tickets: [8281]

    - title: "RTF Input: Fix regression that broke the Preprocess HTML option"

    - title: "Fix XSS vulnerability in content server."
      tickets: [7980]

    - title: "TXT Output: Clean up and produce consistent output. Spacing around headings. Headings are not indented when using the remove paragraph spacing option."

    - title: "Catalog generation: Handle invalid covers gracefully"

    - title: "Email settings: Before displaying the email test dialog warn the user that it will expose their email password"

    - title: "PDB Output: Fix regression that caused some PDB files to not work with other software"
      tickets: [8231]

  improved recipes:
    - Financial Times UK
    - Globe and Mail
    - Wired Daily
    - MIT Technology Review
    - MSNBC
    - expansion.com
    - New York Times
    - Heraldo de Aragon
    - Exiled online

  new recipes:
    - title: "Yakima Herald and Tri-City Herald"
      author: "Laura Gjovaag"

    - title: "Wichita Eagle"
      author: "Jason Cameron"

    - title: "Pressthink and Zero Hedge"
      author: "Darko Miletic"

    - title: "tyzden"
      author: "zemiak"

    - title: "El Correo"
      author: "desUBIKado"

    - title: "Cicero"
      author: "mad"

    - title: "El Publico"
      author: "Gerardo Diez"

- version: 0.7.38
  date: 2011-01-07

  new features:
    - title: "Reduce startup time when using a composite custom column"

    - title: "Template language: Add a list_item function for use with tags-like columns. See User Manual for details"

    - title: "TXT Input: Attempt to detect the input encoding when not specified. Auto detect paragraph structure and formatting markup."

    - title: "Search & replace: Add ability to manipulate number and boolean columns."

    - title: "Add type ahead completion to the advanced search dialog."
      tickets: [8035]

    - title: "Double click on plugin in Preferences dialog to customize"
      tickets: [8175]

    - title: "Allow customization of the SONY driver to send thumbnail to the device. Useful with newer SONY readers"
      tickets: [8161]

    - title: "Smarten punctuation: Convert double dashes to em dashes. Preprocessing: Various tweaks"

  bug fixes:
    - title: "Fix regression causing the template formatter to interpret a missing format letter as ERROR instead of 's'."

    - title: "Fix regression that broke conversion of PNG images in PDF files on OS X."
      tickets: [8215]

    - title: "Content server: Fix improper XML escaping of category titles in the OPDS feeds"
      tickets: [8225]

    - title: "When decoding XML if the XML starts with a UTF-8 BOM decode as UTF-8. Fixes parsing of FB2 files with UTF-8 BOMs"

    - title: "E-book viewer: When scrolling to a bookmark and the content is wider than the window, do not scroll in the horizontal direction"

    - title: "E-book viewer: Fix next page skipping the bottom of chapters when the content is wider than the window."
      tickets: [8153]

    - title: "FB2 Output: Insert covers."
      tickets: [8172]

    - title: "Content server: When serving OPDS feeds handle html descriptions that have namespaced attributes."
      tickets: [7938]

    - title: "When downloading metadata from isbndb.com, download a maximum of 30 results rather than 1000"

    - title: "Fix sorting of the tags column"

    - title: "Change search/replace to show commas instead of vertical bars as the separator for multiple authors"

    - title: "Template language: Make all column names case insensitive"

    - title: "Fix bug that prevented the Disabled option for Tag Browser partitioning from working in the Preferences dialog"

    - title: "Fix bug when using tags-like custom columns in the template language"

    - title: "Fix bug where composite custom columns using general_program_mode fields are not evaluated correctly when used in a template."

    - title: "ImageMagick interface: Don't crash when asked to open empty image files"

    - title: "Kobo driver: Add TXT, CBZ, CBR to supported formats list"
      tickets: [8124]

    - title: "Don't unnecessarily scroll the book list horizontally when re-selecting previously selected rows."

  new recipes:
    - title: "New London Day"
      author: "Being"

    - title: "Walla"
      author: "marbs"

    - title: "New Journal of Physics"
      author: "Chema Cortes"

    - title: "The Baltimore Sun"
      author: "Josh Hall"

    - title: "Arabian Business and Sunday Times (UK)"
      author: "Darko Miletic"

    - title: "Deia"
      author: "Gerardo Diez"

    - title: "Smarter Planet"
      author: "Jack Mason"

  improved recipes:
    - The Atlantic
    - Danas
    - Ledevoir

- version: 0.7.37
  date: 2011-01-02

  new features:
    - title: "This release is mostly a bug fix release to fix various things that got broken by all the changes in 0.7.36"

    - title: "Tag browser: Move the configuration of the sub-category grouping from tweaks to the Preferences dialog"

    - title: "Tag browser: Allow changing the sub-categorization scheme from the right click menu"

  bug fixes:
    - title: "Fix regression in 0.7.36 that caused the Tag Browser to break if you have items in it with empty sort values"

    - title: "Catalog generation: Fix various regressions introduced in 0.7.36 on windows"
      description: >
        "Database integrity check not working after catalog generation. Catalog generation failing with a file in use error. Spurious question marks appearing in the catalog"

    - title: "Catalog generation: Work on a copy of the library database so as not to lock it"

    - title: "Catalog generation: Handle merge of comments + custom field when the custom field is None"

    - title: "Fix regression that broke the sort_columns_at_startup tweak in 0.7.36"

    - title: "Tag Browser: Fix the Manage X items in the right click menu, which broke in 0.7.36"

    - title: "Tag Browser: Fix grouping by name for authors"

    - title: "Nook color: Fix main memory and SD card swapped in calibre"
      tickets: [8159]

    - title: "Fix regression in 0.7.36 that broke PDF Output when specifying a cover"

    - title: "Catalog generation: Fix regression in MOBI catalog that caused it to not appear as a periodical on the Kindle"

    - title: "Fix regression in 0.7.36 that broke opening the book details dialog by double clicking on the book details panel"

- version: 0.7.36
  date: 2011-01-01

  new features:
    - title: "Tag browser: Add subcategories and search"
      description: "When a category has many items, it will be automatically split up. Also add a search to quickly find an item in the Tag Browser. The sub categories can be controlled via Preferences->Tweaks. Also add a button to collapse all categories"
      type: major

    - title: "Device drivers for the Google Nexus S, Motorola Backflip, Samsung Galaxy Tablet, PocketBook 603/903, EEEReader DR900 and the NextBook"

    - title: "Tag editor dialog now remembers its last used size"
      tickets: [8063]

    - title: "OS X dmg: Add a symlink pointing to the Applications folder for easy installation"
      tickets: [8052]

    - title: "Catalog generation: CSV/XML catalogs now support custom fields. Also write UTF-8 BOM to CSV output file."
      tickets: [8014]

    - title: "EPUB/MOBI catalogs: Various new features"
      description: "Added a custom field/value for excluding books, OR'd with the existing tag list. Added a thumbnail width hint, from 1.0 - 2.0 inches. Deprecated support for the special note tag '*', added support for a custom column containing the note to be inserted in the Description header. Added a 'Merge with comments' feature, which non-destructively combines Comments with a custom field when generating Descriptions. Moved the Description header into a user-editable template file. All fields except thumb and comments are accessible to the template."
      tickets: [7820, 5297, 6765]

    - title: "SONY driver: Allow the creation of an All by Something category via the tweaks."

    - title: "Add a tweak to control the delay when sending mails using gmail or hotmail."
      tickets: [8064]

    - title: "Add output encoding option for TXT/PDB/PMLX output plugins to the GUI"

    - title: "Add an environment variable to control the temporary directory calibre uses"

    - title: "Use the new HTML editor widget for comments custom columns as well"

    - title: "Content server: Fix regression that broke saved searches"
      tickets: [8047]

    - title: "E-book viewer: Fix regression that broke previous page button"

    - title: "Add a tweak to allow double clicking on the book list to open the edit metadata dialog"
      tickets: [8032]

    - title: "Add a tweak to use a template for formatting SONY collection names"
      tickets: [8033]

    - title: "Bulk edit metadata, search and replace: Show all values for multiple fields in the text region, separated by :::"
      tickets: [8030]

    - title: "Update user agent used by calibre when connecting to websites"

  bug fixes:
    - title: "FB2 Output: Fix regression that broke images in generated FB2 files"
      tickets: [8142]

    - title: "When unzipping zip files that contain filenames with unknown character encoding, sanitize the filenames correctly"
      tickets: [8050]

    - title: "TCR Output: Fix TCR compression adding junk to the end of the text. Remove compression level option."

    - title: "PDF Output: Fix regression that broke the margin options."

    - title: "FB2 Input: Handle non UTF-8 encodings on OS X"
      tickets: [8115]

    - title: "SNB Input: Better error handling if some metadata is missing in the SNB file. Add Wi-Fi connection support for the Bambook"

    - title: "Allow hyperlinks to be clicked in comments metadata in the book details panel"
      tickets: [8054]

  improved recipes:
    - Brand Eins
    - Volksrant
    - Smithsonian
    - Business World
    - El Universal
    - Salon
    - The Week
    - EL Pais
    - Wired Magazine
    - Heraldo de Aragon

  new recipes:
    - title: "Karlsruhe News"
      author: "tfeld"

    - title: "El Periodico and Red Aragon"
      author: "desUBIKado"

    - title: "Business Insider"
      author: "Darko Miletic"

- version: 0.7.35
  date: 2010-12-23
resources/catalog/section_list_templates.py (new file, 39 lines)
@@ -0,0 +1,39 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

__license__   = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

'''
Available fields:
{title}          Title of the book
{series}         Series name
{series_index}   Number of the book in the series
{rating}         Rating
{rating_parens}  Rating, in parentheses
{pubyear}        Year the book was published
{pubyear_parens} Year the book was published, in parentheses
'''

# Books by Author
by_authors_normal_title_template = '{title} {pubyear_parens}'
by_authors_series_title_template = '[{series_index}] {title} {pubyear_parens}'

# Books by Title
by_titles_normal_title_template = '{title}'
by_titles_series_title_template = '{title} ({series} [{series_index}])'

# Books by Series
by_series_title_template = '[{series_index}] {title} {pubyear_parens}'

# Books by Genre
by_genres_normal_title_template = '{title} {pubyear_parens}'
by_genres_series_title_template = '{series_index}. {title} {pubyear_parens}'

# Recently Added
by_recently_added_normal_title_template = '{title}'
by_recently_added_series_title_template = '{title} ({series} [{series_index}])'

# By Month added
by_month_added_normal_title_template = '{title} {pubyear_parens}'
by_month_added_series_title_template = '[{series_index}] {title} {pubyear_parens}'
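A minimal sketch of how such a template might be filled in, using Python's
built-in str.format (the field values below are hypothetical examples, not
data from calibre's catalog code):

    fields = {'title': 'Foundation', 'series': 'Foundation',
              'series_index': 1, 'pubyear_parens': '(1951)'}
    print by_authors_series_title_template.format(**fields)
    # prints: [1] Foundation (1951)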
@@ -2,19 +2,29 @@ body { background-color: white; }
 
 p.title {
     margin-top:0em;
-    margin-bottom:1em;
+    margin-bottom:0em;
     text-align:center;
     font-style:italic;
     font-size:xx-large;
     border-bottom: solid black 2px;
 }
 
+p.series_id {
+    margin-top:0em;
+    margin-bottom:0em;
+    text-align:center;
+}
+
+a.series_id {
+    font-style:normal;
+    font-size:large;
+}
+
 p.author {
-    font-size:large;
     margin-top:0em;
     margin-bottom:0em;
     text-align: center;
     text-indent: 0em;
+    font-size:large;
 }
 
 p.author_index {
@@ -26,7 +36,8 @@ p.author_index {
     text-indent: 0em;
 }
 
-p.tags {
+p.genres {
     font-style:normal;
     margin-top:0.5em;
     margin-bottom:0em;
     text-align: left;
@@ -108,6 +119,13 @@ p.date_read {
     text-indent:-6em;
 }
 
+hr.annotations_divider {
+    width:50%;
+    margin-left:1em;
+    margin-top:0em;
+    margin-bottom:0em;
+}
+
 hr.description_divider {
     width:90%;
     margin-left:5%;
@@ -117,20 +135,37 @@ hr.description_divider {
     border-left: solid white 0px;
 }
 
-hr.annotations_divider {
-    width:50%;
-    margin-left:1em;
-    margin-top:0em;
-    margin-bottom:0em;
+hr.header_divider {
+    width:100%;
+    border-top: solid white 1px;
+    border-right: solid white 0px;
+    border-bottom: solid black 2px;
+    border-left: solid white 0px;
+}
+
+hr.merged_comments_divider {
+    width:80%;
+    margin-left:10%;
+    border-top: solid white 0px;
+    border-right: solid white 0px;
+    border-bottom: dashed gray 2px;
+    border-left: solid white 0px;
 }
 
 td.publisher, td.date {
     font-weight:bold;
     text-align:center;
 }
-td.rating {
-    text-align: center;
 
+td.rating{
+    text-align:center;
 }
 
 td.notes {
     font-size: 100%;
     text-align:center;
 }
 
 td.thumbnail img {
     -webkit-box-shadow: 4px 4px 12px #999;
 }
resources/catalog/template.xhtml (new file, 41 lines)
@@ -0,0 +1,41 @@
<html xmlns="{xmlns}">
<head>
<title>{title_str}</title>
<meta name="catalog description header" http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<link rel="stylesheet" type="text/css" href="stylesheet.css" media="screen" />
</head>
<body>
<p class="title">{title}</p>
<p class="series_id"><a class="series_id">{series} [{series_index}]</a></p>
<hr class="header_divider" />
<p class="author">{author_prefix}<a class="author">{author}</a></p>
<p class="genres">{genres}</p>
<p class="formats">{formats}</p>
<table width="100%" border="0">
  <tr>
    <td class="thumbnail" rowspan="7">{thumb}</td>
    <td class="empty"></td>
  </tr>
  <tr>
    <td class="empty"></td>
  </tr>
  <tr>
    <td class="publisher">{publisher}</td>
  </tr>
  <tr>
    <td class="date">{pubyear}</td>
  </tr>
  <tr>
    <td class="rating">{rating}</td>
  </tr>
  <tr>
    <td class="notes">{note_source}: {note_content}</td>
  </tr>
  <tr>
    <td></td>
  </tr>
</table>
<hr class="description_divider" />
<div class="description">{comments}</div>
</body>
</html>
@@ -55,6 +55,27 @@ author_sort_copy_method = 'invert'
 # categories_use_field_for_author_name = 'author_sort'
 categories_use_field_for_author_name = 'author'
 
+# When partitioning the tags browser, the format of the subcategory label is
+# controlled by a template: categories_collapsed_name_template if sorting by
+# name, categories_collapsed_rating_template if sorting by average rating, and
+# categories_collapsed_popularity_template if sorting by popularity. There are
+# two variables available to the template: first and last. The variable 'first'
+# is the initial item in the subcategory, and the variable 'last' is the final
+# item in the subcategory. Both variables are 'objects'; they each have multiple
+# values that are obtained by using a suffix. For example, first.name for an
+# author category will be the name of the author. The sub-values available are:
+#  name: the printable name of the item
+#  count: the number of books that reference this item
+#  avg_rating: the average rating of all the books referencing this item
+#  sort: the sort value. For authors, this is the author_sort for that author
+#  category: the category (e.g., authors, series) that the item is in.
+# Note that the "r'" in front of the { is necessary if there are backslashes
+# (\ characters) in the template. It doesn't hurt anything to leave it there
+# even if there aren't any backslashes.
+categories_collapsed_name_template = r'{first.sort:shorten(4,'',0)} - {last.sort:shorten(4,'',0)}'
+categories_collapsed_rating_template = r'{first.avg_rating:4.2f:ifempty(0)} - {last.avg_rating:4.2f:ifempty(0)}'
+categories_collapsed_popularity_template = r'{first.count:d} - {last.count:d}'
 
 
 # Set whether boolean custom columns are two- or three-valued.
 # Two-values for true booleans
@@ -289,3 +310,11 @@ locale_for_sorting = ''
 # metadata one book at a time. If True, then the fields are laid out using two
 # columns. If False, one column is used.
 metadata_single_use_2_cols_for_custom_fields = True
 
+# The number of seconds to wait before sending emails when using a
+# public email server like gmail or hotmail. Default is: 5 minutes
+# Setting it lower may cause the server's SPAM controls to kick in,
+# making email sending fail. Changes will take effect only after a restart of
+# calibre.
+public_smtp_relay_delay = 301
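A worked example of the collapsed name template above (hypothetical author
sort values, not from the tweaks file): if the first item of a subcategory
sorts as 'Adams, Douglas' and the last as 'Asimov, Isaac', shorten(4,'',0)
keeps the first four characters of each sort value, so the label renders as:

    Adam - Asim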
New binary files:
BIN  resources/images/document-encrypt.png       (8.8 KiB)
BIN  resources/images/news/arabian_business.png  (4.1 KiB)
BIN  resources/images/news/exiled.png            (1.3 KiB)
BIN  resources/images/news/pressthink.png        (533 B)
BIN  resources/images/news/zerohedge.png         (3.0 KiB)
BIN  resources/images/template_funcs.png         (16 KiB)
resources/recipes/arabian_business.recipe (new file, 86 lines)
@@ -0,0 +1,86 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
www.arabianbusiness.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Arabian_Business(BasicNewsRecipe):
    title = 'Arabian Business'
    __author__ = 'Darko Miletic'
    description = 'Comprehensive Guide to Middle East Business & Gulf Industry News including,Banking & Finance,Construction,Energy,Media & Marketing,Real Estate,Transportation,Travel,Technology,Politics,Healthcare,Lifestyle,Jobs & UAE guide.Top Gulf & Dubai Business News.'
    publisher = 'Arabian Business Publishing Ltd.'
    category = 'ArabianBusiness.com,Arab Business News,Middle East Business News,Middle East Business,Arab Media News,Industry Events,Middle East Industry News,Arab Business Industry,Dubai Business News,Financial News,UAE Business News,Middle East Press Releases,Gulf News,Arab News,GCC Business News,Banking Finance,Media Marketing,Construction,Oil Gas,Retail,Transportation,Travel Hospitality,Photos,Videos,Life Style,Fashion,United Arab Emirates,UAE,Dubai,Sharjah,Abu Dhabi,Qatar,KSA,Saudi Arabia,Bahrain,Kuwait,Oman,Europe,South Asia,America,Asia,news'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'en'
    remove_empty_feeds = True
    publication_type = 'newsportal'
    masthead_url = 'http://www.arabianbusiness.com/skins/ab.main/gfx/arabianbusiness_logo_sm.gif'
    extra_css = """
        body{font-family: Georgia,serif }
        img{margin-bottom: 0.4em; margin-top: 0.4em; display:block}
        .byline,.dateline{font-size: small; display: inline; font-weight: bold}
        ul{list-style: none outside none;}
    """

    conversion_options = {
          'comment'   : description
        , 'tags'      : category
        , 'publisher' : publisher
        , 'language'  : language
    }

    remove_tags_before = dict(attrs={'id':'article-title'})
    remove_tags = [
         dict(name=['meta','link','base','iframe','embed','object'])
        ,dict(attrs={'class':'printfooter'})
    ]
    remove_attributes = ['lang']

    feeds = [
         (u'Africa', u'http://www.arabianbusiness.com/world/Africa/?service=rss')
        ,(u'Americas', u'http://www.arabianbusiness.com/world/americas/?service=rss')
        ,(u'Asia Pacific', u'http://www.arabianbusiness.com/world/asia-pacific/?service=rss')
        ,(u'Europe', u'http://www.arabianbusiness.com/world/europe/?service=rss')
        ,(u'Middle East', u'http://www.arabianbusiness.com/world/middle-east/?service=rss')
        ,(u'South Asia', u'http://www.arabianbusiness.com/world/south-asia/?service=rss')
        ,(u'Banking & Finance', u'http://www.arabianbusiness.com/industries/banking-finance/?service=rss')
        ,(u'Construction', u'http://www.arabianbusiness.com/industries/construction/?service=rss')
        ,(u'Education', u'http://www.arabianbusiness.com/industries/education/?service=rss')
        ,(u'Energy', u'http://www.arabianbusiness.com/industries/energy/?service=rss')
        ,(u'Healthcare', u'http://www.arabianbusiness.com/industries/healthcare/?service=rss')
        ,(u'Media', u'http://www.arabianbusiness.com/industries/media/?service=rss')
        ,(u'Real Estate', u'http://www.arabianbusiness.com/industries/real-estate/?service=rss')
        ,(u'Retail', u'http://www.arabianbusiness.com/industries/retail/?service=rss')
        ,(u'Technology', u'http://www.arabianbusiness.com/industries/technology/?service=rss')
        ,(u'Transport', u'http://www.arabianbusiness.com/industries/transport/?service=rss')
        ,(u'Travel', u'http://www.arabianbusiness.com/industries/travel-hospitality/?service=rss')
        ,(u'Equities', u'http://www.arabianbusiness.com/markets/equities/?service=rss')
        ,(u'Commodities', u'http://www.arabianbusiness.com/markets/commodities/?service=rss')
        ,(u'Currencies', u'http://www.arabianbusiness.com/markets/currencies/?service=rss')
        ,(u'Market Data', u'http://www.arabianbusiness.com/markets/market-data/?service=rss')
        ,(u'Comment', u'http://www.arabianbusiness.com/opinion/comment/?service=rss')
        ,(u'Think Tank', u'http://www.arabianbusiness.com/opinion/think-tank/?service=rss')
        ,(u'Arts', u'http://www.arabianbusiness.com/lifestyle/arts/?service=rss')
        ,(u'Cars', u'http://www.arabianbusiness.com/lifestyle/cars/?service=rss')
        ,(u'Food', u'http://www.arabianbusiness.com/lifestyle/food/?service=rss')
        ,(u'Sport', u'http://www.arabianbusiness.com/lifestyle/sport/?service=rss')
    ]

    def print_version(self, url):
        return url + '?service=printer&page='

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        return soup
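A sketch of what this print_version produces (the article URL below is
hypothetical):

    url = 'http://www.arabianbusiness.com/some-article-123456.html'
    print_version(url)
    # -> 'http://www.arabianbusiness.com/some-article-123456.html?service=printer&page='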
@@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 theatlantic.com
 '''
-import string, re
+import re
 
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import Tag, NavigableString
@@ -33,25 +33,27 @@ class TheAtlantic(BasicNewsRecipe):
         articles = []
 
         soup = self.index_to_soup(self.INDEX)
-        sectit = soup.find('h1', attrs={'class':'sectionTitle'})
-        if sectit is not None:
-            texts = self.tag_to_string(sectit).strip().split()[-2:]
-            if texts:
-                self.timefmt = ' [%s]'%(' '.join(texts))
+        ts = soup.find(id='magazineTopStories')
+        ds = self.tag_to_string(ts.find('h1')).split(':')[-1]
+        self.timefmt = ' [%s]'%ds
 
         cover = soup.find('img', src=True, attrs={'class':'cover'})
         if cover is not None:
             self.cover_url = cover['src']
 
         feeds = []
+        seen_titles = set([])
         for section in soup.findAll('div', attrs={'class':'magazineSection'}):
-            section_title = section.find(attrs={'class':'sectionHeader'})
-            section_title = string.capwords(self.tag_to_string(section_title))
+            section_title = self.tag_to_string(section.find('h2'))
             self.log('Found section:', section_title)
             articles = []
-            for post in section.findAll('div', attrs={'class':'post'}):
+            for post in section.findAll('div', attrs={'class':lambda x : x and
+                'post' in x}):
                 h = post.find(['h3', 'h4'])
                 title = self.tag_to_string(h)
+                if title in seen_titles:
+                    continue
+                seen_titles.add(title)
                 a = post.find('a', href=True)
                 url = a['href']
                 if url.startswith('/'):
@@ -64,36 +66,23 @@ class TheAtlantic(BasicNewsRecipe):
                 self.log('\t\t', desc)
                 articles.append({'title':title, 'url':url, 'description':desc,
                     'date':''})
-            feeds.append((section_title, articles))
+            if articles:
+                feeds.append((section_title, articles))
 
         poems = []
         self.log('Found section: Poems')
-        for poem in soup.findAll('div', attrs={'class':'poem'}):
-            title = self.tag_to_string(poem.find('h4'))
-            desc = self.tag_to_string(poem.find(attrs={'class':'author'}))
+        pd = soup.find('h2', text='Poetry').parent.parent
+        for poem in pd.findAll('h4'):
+            title = self.tag_to_string(poem)
             url = poem.find('a')['href']
             if url.startswith('/'):
                 url = 'http://www.theatlantic.com' + url
             self.log('\tFound article:', title, 'at', url)
-            self.log('\t\t', desc)
-            poems.append({'title':title, 'url':url, 'description':desc,
+            poems.append({'title':title, 'url':url, 'description':'',
                 'date':''})
+        if poems:
+            feeds.append(('Poems', poems))
 
-        div = soup.find(id='advice')
-        if div is not None:
-            self.log('Found section: Advice')
-            title = self.tag_to_string(div.find('h4'))
-            url = div.find('a')['href']
-            if url.startswith('/'):
-                url = 'http://www.theatlantic.com' + url
-            desc = self.tag_to_string(div.find('p'))
-            self.log('\tFound article:', title, 'at', url)
-            self.log('\t\t', desc)
-
-            feeds.append(('Advice', [{'title':title, 'url':url, 'description':desc,
-                'date':''}]))
         return feeds
 
     def postprocess_html(self, soup, first):
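The switch from attrs={'class':'post'} to a lambda matcher broadens which
posts are picked up. A sketch with hypothetical markup (under BeautifulSoup 3,
where the class attribute is matched as one string):

    # <div class="post">        matched by both versions
    # <div class="post first">  matched only by the lambda version, because
    #                           'post' in 'post first' is True, while the
    #                           exact match 'post' == 'post first' fails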
resources/recipes/baltimore_sun.recipe (new file, 186 lines)
@@ -0,0 +1,186 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = 'Original 2009, Kovid Goyal <kovid@kovidgoyal.net>'
__copyright__ = 'Modified 2011, Josh Hall <jwtheiv@gmail.com>'
__docformat__ = 'restructuredtext en'

'''
www.baltimoresun.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class BaltimoreSun(BasicNewsRecipe):

    title = 'The Baltimore Sun'
    __author__ = 'Josh Hall'
    description = 'Politics, local and business news from Baltimore'
    language = 'en'
    oldest_article = 1
    max_articles_per_feed = 100
    remove_empty_feeds = True
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    #masthead_url = 'http://www.baltimoresun.com/images/thirdpartylogo.gif'

    remove_tags_before = dict(name='div', attrs={'class':['story', 'entry']})
    remove_tags_after = [
        {'class':['photo_article',]},
        dict(name='div', attrs={'class':'shirttail-promo right clearfix'}),
    ]

    keep_only_tags = [
        dict(name='div', attrs={'class':["story","entry-asset asset hentry"]}),
        dict(name='div', attrs={'id':["pagebody","story","maincontentcontainer"]}),
    ]

    remove_tags = [
        {'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer","article-promo"]},
        {'class':["entry-footer-left","entry-footer-right","shirttail-promo right clearfix","clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent","toppaginate","module","module-header","module-content"]},
        dict(name='font', attrs={'id':["cr-other-headlines"]}),
        dict(name=['iframe']),
    ]

    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
        .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
        .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center}
        .story{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''

    feeds = [
        (u'Top Headlines', u'http://www.baltimoresun.com/rss2.0.xml'),
        (u'Breaking News', u'http://www.baltimoresun.com/news/breaking/rss2.0.xml'),
        (u'Top Maryland', u'http://www.baltimoresun.com/news/maryland/rss2.0.xml'),
        #(u'Anne Arundel County', u'http://www.baltimoresun.com/news/maryland/anne-arundel/rss2.0.xml'),
        (u'Baltimore City', u'http://www.baltimoresun.com/news/maryland/baltimore-city/rss2.0.xml'),
        #(u'Baltimore County', u'http://www.baltimoresun.com/news/maryland/baltimore-county/rss2.0.xml'),
        #(u'Carroll County', u'http://www.baltimoresun.com/news/maryland/carroll/rss2.0.xml'),
        #(u'Harford County', u'http://www.baltimoresun.com/news/maryland/harford/rss2.0.xml'),
        #(u'Howard County', u'http://www.baltimoresun.com/news/maryland/howard/rss2.0.xml'),
        (u'Education', u'http://www.baltimoresun.com/news/education/rss2.0.xml'),
        #(u'Obituaries', u'http://www.baltimoresun.com/news/obituaries/rss2.0.xml'),
        (u'Local Politics', u'http://www.baltimoresun.com/news/maryland/politics/rss2.0.xml'),
        (u'Weather', u'http://www.baltimoresun.com/news/weather/rss2.0.xml'),
        #(u'Traffic', u'http://www.baltimoresun.com/features/commuting/rss2.0.xml'),
        (u'Nation/world', u'http://feeds.chicagotribune.com/chicagotribune/news/nationworld/'),
        (u'Weird News', u'http://www.baltimoresun.com/news/offbeat/rss2.0.xml'),

        (u'Top Sports', u'http://www.baltimoresun.com/sports/rss2.0.xml'),
        (u'Orioles/Baseball', u'http://www.baltimoresun.com/sports/orioles/rss2.0.xml'),
        (u'Ravens/Football', u'http://www.baltimoresun.com/sports/ravens/rss2.0.xml'),
        #(u'Terps', u'http://www.baltimoresun.com/sports/terps/rss2.0.xml'),
        #(u'College Football', u'http://www.baltimoresun.com/sports/college/football/rss2.0.xml'),
        #(u'Lacrosse', u'http://www.baltimoresun.com/sports/college/lacrosse/rss2.0.xml'),
        #(u'Horse Racing', u'http://www.baltimoresun.com/sports/horse-racing/rss2.0.xml'),
        #(u'Golf', u'http://www.baltimoresun.com/sports/golf/rss2.0.xml'),
        #(u'NBA', u'http://www.baltimoresun.com/sports/nba/rss2.0.xml'),
        #(u'High School', u'http://www.baltimoresun.com/sports/high-school/rss2.0.xml'),
        #(u'Outdoors', u'http://www.baltimoresun.com/sports/outdoors/rss2.0.xml'),

        (u'Celebrity News', u'http://www.baltimoresun.com/entertainment/celebrities/rss2.0.xml'),
        (u'Arts & Theater', u'http://www.baltimoresun.com/entertainment/arts/rss2.0.xml'),
        (u'Movies', u'http://www.baltimoresun.com/entertainment/movies/rss2.0.xml'),
        (u'Music & Nightlife', u'http://www.baltimoresun.com/entertainment/music/rss2.0.xml'),
        (u'Restaurants & Food', u'http://www.baltimoresun.com/entertainment/dining/rss2.0.xml'),
        (u'TV/Media', u'http://www.baltimoresun.com/entertainment/tv/rss2.0.xml'),

        (u'Health&Wellness', u'http://www.baltimoresun.com/health/rss2.0.xml'),
        (u'Home & Garden', u'http://www.baltimoresun.com/features/home-garden/rss2.0.xml'),
        (u'Living Green', u'http://www.baltimoresun.com/features/green/rss2.0.xml'),
        (u'Parenting', u'http://www.baltimoresun.com/features/parenting/rss2.0.xml'),
        (u'Fashion', u'http://www.baltimoresun.com/features/fashion/rss2.0.xml'),
        (u'Travel', u'http://www.baltimoresun.com/travel/rss2.0.xml'),
        (u'Faith', u'http://www.baltimoresun.com/features/faith/rss2.0.xml'),

        (u'Top Business', u'http://www.baltimoresun.com/business/rss2.0.xml'),
        (u'Technology', u'http://www.baltimoresun.com/business/technology/rss2.0.xml'),
        (u'Personal finance', u'http://www.baltimoresun.com/business/money/rss2.0.xml'),
        (u'Real Estate', u'http://www.baltimoresun.com/classified/realestate/rss2.0.xml'),
        (u'Jobs', u'http://www.baltimoresun.com/classified/jobs/rss2.0.xml'),
        (u'DIY', u'http://www.baltimoresun.com/features/do-it-yourself/rss2.0.xml'),
        (u'Consumer Safety', u'http://www.baltimoresun.com/business/consumer-safety/rss2.0.xml'),
        (u'Investing', u'http://www.baltimoresun.com/business/money/rss2.0.xml'),

        (u'Sun Editorials', u'http://www.baltimoresun.com/news/opinion/editorial/rss2.0.xml'),
        (u'Op/Ed', u'http://www.baltimoresun.com/news/opinion/oped/rss2.0.xml'),
        (u'Readers Respond', u'http://www.baltimoresun.com/news/opinion/readersrespond/'),

        (u'Kevin Cowherd', 'http://www.baltimoresun.com/sports/bal-columnist-cowherd,0,6829726.columnist-rss2.0.xml'),
        (u'Jay Hancock', u'http://www.baltimoresun.com/business/money/bal-columnist-hancock,0,6673611.columnist-rss2.0.xml'),
        (u'Jacques Kelly', u'http://www.baltimoresun.com/news/maryland/bal-columnist-kelly,0,1154701.columnist-rss2.0.xml'),
        (u'Marta H. Mossburg', u'http://www.baltimoresun.com/news/opinion/oped/bal-columnist-mossburg,0,7982155.columnist-rss2.0.xml'),
        (u'Mike Preston', u'http://www.baltimoresun.com/sports/bal-columnist-preston,0,6169796.columnist-rss2.0.xml'),
        (u'Susan Reimer', u'http://www.baltimoresun.com/news/opinion/bal-columnist-reimer,0,162466.columnist-rss2.0.xml'),
        (u'Dan Rodricks', u'http://www.baltimoresun.com/news/maryland/bal-columnist-rodricks,0,7089843.columnist-rss2.0.xml'),
        (u'Thomas F. Schaller', u'http://www.baltimoresun.com/news/opinion/columnists/bal-columnist-schaller,0,897397.columnist-rss2.0.xml'),
        (u'Peter Schmuck', u'http://www.baltimoresun.com/sports/bal-columnist-schmuck,0,7485088.columnist-rss2.0.xml'),
        (u'Ron Smith', u'http://www.baltimoresun.com/news/opinion/bal-columnist-ronsmith,0,3964803.columnist-rss2.0.xml'),

        (u'Baltimore Crime Beat', u'http://weblogs.baltimoresun.com/news/crime/blog/index.xml'),
        (u'Getting There', u'http://weblogs.baltimoresun.com/news/traffic/index.xml'),
        (u'InsideEd', u'http://weblogs.baltimoresun.com/news/education/blog/index.xml'),
        (u'Maryland Politics', u'http://weblogs.baltimoresun.com/news/local/politics/index.xml'),
        (u'Maryland Weather', u'http://weblogs.marylandweather.com/index.xml'),
        (u'Second Opinion', u'http://weblogs.baltimoresun.com/news/opinion/index.xml'),
        (u'You Dont Say', u'http://weblogs.baltimoresun.com/news/mcintyre/blog/index.xml'),

        (u'BaltTech', u'http://weblogs.baltimoresun.com/news/technology/index.xml'),
        (u'Consuming Interests', u'http://weblogs.baltimoresun.com/business/consuminginterests/blog/index.xml'),
        (u'Jay Hancocks Blog', u'http://weblogs.baltimoresun.com/business/hancock/blog/index.xml'),
        (u'The Real Estate Wonk', u'http://weblogs.baltimoresun.com/business/realestate/blog/index.xml'),

        (u'Clef Notes', 'http://weblogs.baltimoresun.com/entertainment/classicalmusic/index.xml'),
        (u'Dining at Large', u'http://weblogs.baltimoresun.com/entertainment/dining/reviews/blog/index.xml'),
        (u'Midnight Sun', u'http://weblogs.baltimoresun.com/entertainment/midnight_sun/blog/index.xml'),
        (u'Mike Sragow Gets Reel', u'http://weblogs.baltimoresun.com/entertainment/movies/blog/index.xml'),
        (u'Read Street', u'http://weblogs.baltimoresun.com/entertainment/books/blog/index.xml'),
        (u'Reality Check', u'http://weblogs.baltimoresun.com/entertainment/realitycheck/blog/index.xml'),
        (u'Z on TV', u'http://weblogs.baltimoresun.com/entertainment/zontv/index.xml'),

        (u'BMore Green', u'http://weblogs.baltimoresun.com/features/green/index.xml'),
        (u'Charm City Moms', u'http://weblogs.baltimoresun.com/features/baltimoremomblog/index.xml'),
        (u'Exercists', u'http://weblogs.baltimoresun.com/health/fitness/index.xml'),
        (u'Garden Variety', 'http://weblogs.baltimoresun.com/features/gardening/index.xml'),
        #(u'In Good Faith', u'http://weblogs.baltimoresun.com/news/faith/index.xml'),
        (u'Picture of Health', u'http://weblogs.baltimoresun.com/health/index.xml'),
        (u'Unleashed', u'http://weblogs.baltimoresun.com/features/mutts/blog/index.xml'),

        #(u'Faceoff', u'http://weblogs.baltimoresun.com/sports/lacrosse/blog/index.xml'),
        #(u'MMA Stomping Grounds', u'http://weblogs.baltimoresun.com/sports/mma/blog/index.xml'),
        (u'Orioles Insider', u'http://weblogs.baltimoresun.com/sports/orioles/blog/index.xml'),
        #(u'Outdoors Girl', u'http://weblogs.baltimoresun.com/sports/outdoors/blog/index.xml'),
        (u'Ravens Insider', u'http://weblogs.baltimoresun.com/sports/ravens/blog/index.xml'),
        #(u'Recruiting Report', u'http://weblogs.baltimoresun.com/sports/college/recruiting/index.xml'),
        #(u'Ring Posts', u'http://weblogs.baltimoresun.com/sports/wrestling/blog/index.xml'),
        (u'The Schmuck Stops Here', u'http://weblogs.baltimoresun.com/sports/schmuck/index.xml'),
        (u'Toy Department', u'http://weblogs.baltimoresun.com/sports/thetoydepartment/index.xml'),
        #(u'Tracking the Terps', u'http://weblogs.baltimoresun.com/sports/college/maryland_terps/blog/index.xml'),
        #(u'Varsity Letters', u'http://weblogs.baltimoresun.com/sports/highschool/varsityletters/index.xml'),
        (u'Virtual Vensanity', u'http://weblogs.baltimoresun.com/entertainment/bthesite/vensel/index.xml'),
    ]

    def get_article_url(self, article):
        print article.get('feedburner_origlink', article.get('guid', article.get('link')))
        return article.get('feedburner_origlink', article.get('guid', article.get('link')))

    def postprocess_html(self, soup, first_fetch):
        for t in soup.findAll(['table', 'tr', 'td']):
            t.name = 'div'

        for tag in soup.findAll('form', dict(attrs={'name':["comments_form"]})):
            tag.extract()
        for tag in soup.findAll('font', dict(attrs={'id':["cr-other-headlines"]})):
            tag.extract()

        return soup
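get_article_url prefers the FeedBurner original link, falling back to the
guid and then the link. A sketch with a hypothetical feed entry:

    article = {'guid': 'http://www.baltimoresun.com/news/story-xyz.html',
               'link': 'http://feeds.feedburner.com/~r/baltsun/story-xyz'}
    article.get('feedburner_origlink', article.get('guid', article.get('link')))
    # -> 'http://www.baltimoresun.com/news/story-xyz.html'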
@@ -1,19 +1,16 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 mode: python -*-
 
 # Find the newest version of this recipe here:
 # https://github.com/consti/BrandEins-Recipe/raw/master/brandeins.recipe
 
 __license__ = 'GPL v3'
 __copyright__ = '2010, Constantin Hofstetter <consti at consti.de>, Steffen Siebert <calibre at steffensiebert.de>'
-__version__ = '0.96'
+__version__ = '0.97'
 
 ''' http://brandeins.de - Wirtschaftsmagazin '''
 import re
-import string
 from calibre.ebooks.BeautifulSoup import Tag
 from calibre.web.feeds.recipes import BasicNewsRecipe
 
 
 class BrandEins(BasicNewsRecipe):
 
     title = u'brand eins'
@@ -28,6 +25,8 @@ class BrandEins(BasicNewsRecipe):
     language = 'de'
     publication_type = 'magazine'
     needs_subscription = 'optional'
+    # Prevent that conversion date is appended to title
+    timefmt = ''
 
     # 2 is the last full magazine (default)
     # 1 is the newest (but not full)
@@ -66,6 +65,13 @@ class BrandEins(BasicNewsRecipe):
             new_p = "<p><i>"+ content +"</i></p>"
             p.replaceWith(new_p)
 
+        # Change <h3> to <h1>
+        header = soup.find("h3")
+        if header:
+            tag = Tag(soup, "h1")
+            tag.insert(0, header.contents[0])
+            header.replaceWith(tag)
+
         return soup
 
     def get_cover(self, soup):
@@ -77,6 +83,7 @@ class BrandEins(BasicNewsRecipe):
 
     def parse_index(self):
         feeds = []
+        issue_map = {}
 
         archive = "http://www.brandeins.de/archiv.html"
 
@@ -88,21 +95,31 @@ class BrandEins(BasicNewsRecipe):
             pass
 
         soup = self.index_to_soup(archive)
-        latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b') })[0].findAll('ul')[0]
-        pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-issue]
-        url = pre_latest_issue.get('href', False)
-        # Get month and year of the magazine issue - build it out of the title of the cover
-        self.timefmt = " " + re.search(r"(?P<date>\d\d\/\d\d\d\d)", pre_latest_issue.find('img').get('title', False)).group('date')
+        issue_list = soup.findAll('div', attrs={'class': 'tx-brandeinsmagazine-pi1'})[0].findAll('a')
+        issue_list = [i for i in issue_list if i.get('onmouseover', False)]
+        for i in issue_list:
+            issue_number_string = i.get('onmouseover', False)
+            if issue_number_string:
+                match = re.match("^switch_magazine\(([0-9]+), ([0-9]+)\)$", issue_number_string)
+                issue_number = "%04i%02i" % (int(match.group(1)), int(match.group(2)))
+                issue_map[issue_number] = i
+        keys = issue_map.keys()
+        keys.sort()
+        keys.reverse()
+        selected_issue = issue_map[keys[issue-1]]
+        url = selected_issue.get('href', False)
+        # Get the title for the magazine - build it out of the title of the cover - take the issue and year;
+        self.title = "brand eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d)", selected_issue.find('img').get('title', False)).group('date')
         url = 'http://brandeins.de/'+url
 
         # url = "http://www.brandeins.de/archiv/magazin/tierisch.html"
-        titles_and_articles = self.brand_eins_parse_latest_issue(url)
+        titles_and_articles = self.brand_eins_parse_issue(url)
         if titles_and_articles:
            for title, articles in titles_and_articles:
                feeds.append((title, articles))
        return feeds
 
-    def brand_eins_parse_latest_issue(self, url):
+    def brand_eins_parse_issue(self, url):
        soup = self.index_to_soup(url)
        self.cover_url = self.get_cover(soup)
        article_lists = [soup.find('div', attrs={'class':'subColumnLeft articleList'}), soup.find('div', attrs={'class':'subColumnRight articleList'})]
@@ -145,4 +162,3 @@ class BrandEins(BasicNewsRecipe):
             current_articles.append({'title': title, 'url': url, 'description': description, 'date':''})
         titles_and_articles.append([chapter_title, current_articles])
         return titles_and_articles
-
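A sketch of the sort key built from each issue link's onmouseover handler
(the attribute value below is hypothetical):

    # onmouseover = "switch_magazine(2010, 12)"
    # "%04i%02i" % (2010, 12)  ->  '201012'
    # Sorting these zero-padded keys and reversing them puts the newest issue
    # first, so issue_map[keys[issue-1]] selects the issue-th most recent one.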
@@ -1,7 +1,5 @@
 #!/usr/bin/env python
-
 __license__ = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
 www.businessworld.in
 '''
@@ -22,7 +20,11 @@ class BusinessWorldMagazine(BasicNewsRecipe):
     use_embedded_content = False
     encoding = 'utf-8'
     language = 'en_IN'
-
+    extra_css = """
+        img{display: block; margin-bottom: 0.5em}
+        body{font-family: Arial,Helvetica,sans-serif}
+        h2{color: gray; display: block}
+    """
 
     conversion_options = {
         'comment' : description
@@ -42,7 +44,26 @@ class BusinessWorldMagazine(BasicNewsRecipe):
         articles = []
         linklist = []
         soup = self.index_to_soup(self.INDEX)
 
+        tough = soup.find('div', attrs={'id':'tough'})
+        if tough:
+            for item in tough.findAll('h1'):
+                description = ''
+                title_prefix = ''
+                feed_link = item.find('a')
+                if feed_link and feed_link.has_key('href'):
+                    url = self.ROOT + feed_link['href']
+                    if not self.is_in_list(linklist,url):
+                        title = title_prefix + self.tag_to_string(feed_link)
+                        date = strftime(self.timefmt)
+                        articles.append({
+                             'title'      :title
+                            ,'date'       :date
+                            ,'url'        :url
+                            ,'description':description
+                        })
+                        linklist.append(url)
 
         for item in soup.findAll('div', attrs={'class':'nametitle'}):
             description = ''
             title_prefix = ''
@@ -62,8 +83,8 @@ class BusinessWorldMagazine(BasicNewsRecipe):
         return [(soup.head.title.string, articles)]
 
 
-    keep_only_tags = [dict(name='div', attrs={'id':['register-panel','printwrapper']})]
-    remove_tags = [dict(name=['object','link'])]
+    keep_only_tags = [dict(name='div', attrs={'id':'printwrapper'})]
+    remove_tags = [dict(name=['object','link','meta','base','iframe','link','table'])]
 
     def print_version(self, url):
         return url.replace('/bw/','/bw/storyContent/')
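print_version maps an article URL to its single-page print form with a plain
string substitution (the URL below is hypothetical):

    url = 'http://www.businessworld.in/bw/2011_01_08_Some_Article.html'
    url.replace('/bw/', '/bw/storyContent/')
    # -> 'http://www.businessworld.in/bw/storyContent/2011_01_08_Some_Article.html'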
resources/recipes/cicero.recipe (new file, 35 lines)
@@ -0,0 +1,35 @@
from calibre.web.feeds.news import BasicNewsRecipe

class Cicero(BasicNewsRecipe):
    timefmt = ' [%Y-%m-%d]'
    title = u'Cicero'
    __author__ = 'mad@sharktooth.de'
    description = u'Magazin f\xfcr politische Kultur'
    oldest_article = 7
    language = 'de'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    publisher = 'Ringier Publishing'
    category = 'news, politics, Germany'
    encoding = 'iso-8859-1'
    publication_type = 'magazine'
    masthead_url = 'http://www.cicero.de/img2/cicero_logo_rss.gif'
    feeds = [
        (u'Das gesamte Portfolio', u'http://www.cicero.de/rss/rss.php?ress_id='),
        #(u'Alle Heft-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=heft'),
        #(u'Alle Online-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=online'),
        #(u'Berliner Republik', u'http://www.cicero.de/rss/rss.php?ress_id=4'),
        #(u'Weltb\xfchne', u'http://www.cicero.de/rss/rss.php?ress_id=1'),
        #(u'Salon', u'http://www.cicero.de/rss/rss.php?ress_id=7'),
        #(u'Kapital', u'http://www.cicero.de/rss/rss.php?ress_id=6'),
        #(u'Netzst\xfccke', u'http://www.cicero.de/rss/rss.php?ress_id=9'),
        #(u'Leinwand', u'http://www.cicero.de/rss/rss.php?ress_id=12'),
        #(u'Bibliothek', u'http://www.cicero.de/rss/rss.php?ress_id=15'),
        (u'Kolumne - Alle Kolumnen', u'http://www.cicero.de/rss/rss2.php?ress_id='),
        #(u'Kolumne - Schreiber, Berlin', u'http://www.cicero.de/rss/rss2.php?ress_id=35'),
        #(u'Kolumne - TV Kritik', u'http://www.cicero.de/rss/rss2.php?ress_id=34')
    ]

    def print_version(self, url):
        return 'http://www.cicero.de/page_print.php?' + url.rpartition('?')[2]
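A sketch of the print_version rewrite (the article URL below is hypothetical):

    url = 'http://www.cicero.de/97.php?ress_id=4&item=4821'
    url.rpartition('?')[2]   # -> 'ress_id=4&item=4821'
    # print version: 'http://www.cicero.de/page_print.php?ress_id=4&item=4821'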
@@ -11,7 +11,7 @@ class CNetJapan(BasicNewsRecipe):
         (u'CNet Blog', u'http://feed.japan.cnet.com/rss/blog/index.rdf')
     ]
     language = 'ja'
-    encoding = 'Shift_JIS'
+    encoding = 'utf-8'
     remove_javascript = True
 
     preprocess_regexps = [
@@ -1,5 +1,5 @@
 __license__ = 'GPL v3'
-__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 danas.rs
 '''
@@ -33,13 +33,15 @@ class Danas(BasicNewsRecipe):
                    margin-bottom: 0;
                    margin-top: 0}
     h2,.datum,.lokacija,.autor{font-size: small}
+    .autor{text-transform: uppercase}
     .antrfileNaslov{border-left: 2px solid #999999;
                    margin-left: 0.8em;
                    padding-left: 1.2em;
                    font-weight:bold;
                    margin-bottom: 0;
                    margin-top: 0}
-    img{margin-bottom: 0.8em}
+    img{margin-bottom: 0.8em}
+    .naslovTemeDana{font-size: small}
     """
 
     conversion_options = {
@@ -62,6 +64,7 @@ class Danas(BasicNewsRecipe):
         ,(re.compile(u'\u00f4'), lambda match: '“') # latin small letter o with circumflex
         ,(re.compile(u'\u00f6'), lambda match: '”') # latin small letter o with diaeresis
         ,(re.compile(u'\u00e1'), lambda match: ' ' ) # latin small letter a with acute
+        ,(re.compile(u'\u00e4'), lambda match: ' ' ) # latin small letter a with diaeresis
                           ]
 
     keep_only_tags = [dict(name='div', attrs={'id':'left'})]
@@ -124,6 +127,6 @@ class Danas(BasicNewsRecipe):
         cover_url = None
         soup = self.index_to_soup('http://www.danas.rs/')
         for citem in soup.findAll('img'):
-            if citem['src'].endswith('naslovna.jpg'):
+            if citem['src'].endswith('naslovna.jpg') or citem['src'].endswith('naslovna1.jpg'):
                 return 'http://www.danas.rs' + citem['src']
         return cover_url
resources/recipes/deia.recipe (new file, 70 lines)
@@ -0,0 +1,70 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Gerardo Diez'
__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
__docformat__ = 'restructuredtext en'

'''
deia.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe

class Deia(BasicNewsRecipe):
    title ='Deia'
    __author__ ='Gerardo Diez'
    publisher ='Editorial Iparraguirre, S.A'
    category ='news, politics, finances, world, spain, euskadi'
    publication_type ='newspaper'
    oldest_article =1
    max_articles_per_feed =100
    simultaneous_downloads =10
    cover_url ='http://2.bp.blogspot.com/_RjrWzC6tI14/TM6jrPLaBZI/AAAAAAAAFaI/ayffwxidFEY/s1600/2009-10-13-logo-deia.jpg'
    timefmt ='[%a, %d %b, %Y]'
    encoding ='utf8'
    language ='es'
    remove_javascript =True
    remove_tags_after =dict(id='Texto')
    remove_tags_before =dict(id='Texto')
    remove_tags =[dict(name='div', attrs={'class':['Herramientas ', 'Multimedia']})]
    no_stylesheets =True
    extra_css ='h1 {margin-bottom: .15em;font-size: 2.7em; font-family: Georgia, "Times New Roman", Times, serif;} .Antetitulo {margin: 1em 0;text-transform: uppercase;color: #999;} .PieFoto {margin: .1em 0;padding: .5em .5em .5em .5em;background: #F0F0F0;} .PieFoto p {margin-bottom: 0;font-family: Georgia,"Times New Roman",Times,serif;font-weight: bold; font-style: italic; color: #666;}'
    keep_only_tags =[dict(name='div', attrs={'class':['Texto ', 'NoticiaFicha ']})]
    feeds = [
        (u'Bizkaia' ,u'http://www.deia.com/index.php/services/rss?seccion=bizkaia'),
        (u'Bilbao' ,u'http://www.deia.com/index.php/services/rss?seccion=bilbao'),
        (u'Hemendik eta Handik' ,u'http://www.deia.com/index.php/services/rss?seccion=hemendik-eta-handik'),
        (u'Margen Derecha' ,u'http://www.deia.com/index.php/services/rss?seccion=margen-derecha'),
        (u'Encartaciones y Margen Izquierda' ,u'http://www.deia.com/index.php/services/rss?seccion=margen-izquierda-encartaciones'),
        (u'Costa' ,u'http://www.deia.com/index.php/services/rss?seccion=costa'),
        (u'Duranguesado' ,u'http://www.deia.com/index.php/services/rss?seccion=duranguesado'),
        (u'Llodio-Nervión' ,u'http://www.deia.com/index.php/services/rss?seccion=llodio-nervion'),
        (u'Arratia-Nervión' ,u'http://www.deia.com/index.php/services/rss?seccion=arratia-nervion'),
        (u'Uribe-Txorierri' ,u'http://www.deia.com/index.php/services/rss?seccion=uribe-txorierri'),
        (u'Ecos de sociedad' ,u'http://www.deia.com/index.php/services/rss?seccion=ecos-de-sociedad'),
        (u'Sucesos' ,u'http://www.deia.com/index.php/services/rss?seccion=sucesos'),
        (u'Política' ,u'http://www.deia.com/index.php/services/rss?seccion=politica'),
        (u'Euskadi' ,u'http://www.deia.com/index.php/services/rss?seccion=politica/euskadi'),
        (u'España' ,u'http://www.deia.com/index.php/services/rss?seccion=politica/espana'),
        (u'Sociedad',u'http://www.deia.com/index.php/services/rss?seccion=sociedad'),
        (u'Euskadi' ,u'http://www.deia.com/index.php/services/rss?seccion=socidad/euskadi'),
        (u'Sociedad.España' ,u'http://www.deia.com/index.php/services/rss?seccion=sociedad/espana'),
        (u'Ocio y Cultura' ,u'http://www.deia.com/index.php/services/rss?seccion=ocio-y-cultura'),
        #(u'Cultura' ,u'http://www.deia.com/index.php/services/rss?seccion=cultura'),
        #(u'Ocio' ,u'http://www.deia.com/index.php/services/rss?seccion=ocio'),
        (u'On' ,u'http://www.deia.com/index.php/services/rss?seccion=on'),
        (u'Agenda' ,u'http://www.deia.com/index.php/services/rss?seccion=agenda'),
        (u'Comunicación' ,u'http://www.deia.com/index.php/services/rss?seccion=comunicacion'),
        (u'Viajes' ,u'http://www.deia.com/index.php/services/rss?seccion=viajes'),
        (u'¡Mundo!' ,u'http://www.deia.com/index.php/services/rss?seccion=que-mundo'),
        (u'Humor' ,u'http://www.deia.com/index.php/services/rss?seccion=humor'),
        (u'Opinión' ,u'http://www.deia.com/index.php/services/rss?seccion=opinion'),
        (u'Editorial' ,u'http://www.deia.com/index.php/services/rss?seccion=editorial'),
        (u'Tribuna abierta' ,u'http://www.deia.com/index.php/services/rss?seccion=tribuna-abierta'),
        (u'Colaboración' ,u'http://www.deia.com/index.php/services/rss?seccion=colaboracion'),
        (u'Columnistas' ,u'http://www.deia.com/index.php/services/rss?seccion=columnistas'),
        (u'Deportes' ,u'http://www.deia.com/index.php/services/rss?seccion=deportes'),
        (u'Athletic' ,u'http://www.deia.com/index.php/services/rss?seccion=athletic'),
        (u'Economía' ,'http://www.deia.com/index.php/services/rss?seccion=economia'),
        (u'Mundo' ,u'http://www.deia.com/index.php/services/rss?seccion=mundo')]
@ -9,7 +9,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.BeautifulSoup import Tag, NavigableString

import mechanize, string, urllib, time, re
import string, time, re

class Economist(BasicNewsRecipe):

@ -18,19 +18,19 @@ class Economist(BasicNewsRecipe):

    __author__ = "Kovid Goyal"
    INDEX = 'http://www.economist.com/printedition'
    description = ('Global news and current affairs from a European perspective.'
                   ' Needs a subscription from ')+INDEX
    description = 'Global news and current affairs from a European perspective.'

    oldest_article = 7.0
    cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
    remove_tags = [dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
                   dict(attrs={'class':['dblClkTrk', 'ec-article-info']})]
    keep_only_tags = [dict(id='ec-article-body')]
    needs_subscription = True
    needs_subscription = False
    no_stylesheets = True
    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
                           lambda x:'</html>')]

    '''
    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        br.open('http://www.economist.com')
@ -50,6 +50,7 @@ class Economist(BasicNewsRecipe):
            }))
        br.open(req).read()
        return br
    '''

    def parse_index(self):
        try:
@ -7,12 +7,12 @@ from lxml import html

class Economist(BasicNewsRecipe):

    title = 'The Economist (free)'
    title = 'The Economist (RSS)'
    language = 'en'

    __author__ = "Kovid Goyal"
    description = ('Global news and current affairs from a European perspective.'
                   ' Much slower than the subscription based version.')
                   ' Much slower than the print edition based version.')

    oldest_article = 7.0
    cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
122
resources/recipes/el_correo.recipe
Normal file
@ -0,0 +1,122 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '08 January 2011, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Biscay'
__version__ = 'v0.08'
__date__ = '08, January 2011'
'''
http://www.elcorreo.com/
'''

import time
import re
from calibre.web.feeds.news import BasicNewsRecipe

class heraldo(BasicNewsRecipe):
    __author__ = 'desUBIKado'
    description = 'Daily newspaper from Biscay'
    title = u'El Correo'
    publisher = 'Vocento'
    category = 'News, politics, culture, economy, general interest'
    oldest_article = 2
    delay = 1
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    encoding = 'iso-8859-1'
    remove_empty_feeds = True
    remove_javascript = False

    feeds = [
        (u'Portada', u'http://www.elcorreo.com/vizcaya/portada.xml'),
        (u'Local', u'http://www.elcorreo.com/vizcaya/rss/feeds/vizcaya.xml'),
        (u'Internacional', u'http://www.elcorreo.com/vizcaya/rss/feeds/internacional.xml'),
        (u'Econom\xeda', u'http://www.elcorreo.com/vizcaya/rss/feeds/economia.xml'),
        (u'Pol\xedtica', u'http://www.elcorreo.com/vizcaya/rss/feeds/politica.xml'),
        (u'Opini\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/opinion.xml'),
        (u'Deportes', u'http://www.elcorreo.com/vizcaya/rss/feeds/deportes.xml'),
        (u'Sociedad', u'http://www.elcorreo.com/vizcaya/rss/feeds/sociedad.xml'),
        (u'Cultura', u'http://www.elcorreo.com/vizcaya/rss/feeds/cultura.xml'),
        (u'Televisi\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/television.xml'),
        (u'Gente', u'http://www.elcorreo.com/vizcaya/rss/feeds/gente.xml')
    ]

    keep_only_tags = [
        dict(name='div', attrs={'class':['grouphead','date','art_head','story-texto','text','colC_articulo','contenido_comentarios']}),
        dict(name='div' , attrs={'id':['articulo','story-texto','story-entradilla']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':['art_barra','detalles-opinion','formdenunciar','modulo calculadoras','nubetags','pie']}),
        dict(name='div', attrs={'class':['mod_lomas','bloque_lomas','blm_header','link-app3','link-app4','botones_listado']}),
        dict(name='div', attrs={'class':['navegacion_galeria','modulocanalpromocion','separa','separacion','compartir','tags_relacionados']}),
        dict(name='div', attrs={'class':['moduloBuscadorDeportes','modulo-gente','moddestacadopeq','OpcArt','articulopiniones']}),
        dict(name='div', attrs={'class':['modulo-especial','publiEspecial']}),
        dict(name='div', attrs={'id':['articulopina']}),
        dict(name='br', attrs={'class':'clear'}),
        dict(name='form', attrs={'name':'frm_conversor2'})
    ]

    remove_tags_before = dict(name='div' , attrs={'class':'articulo '})
    remove_tags_after = dict(name='div' , attrs={'class':'comentarios'})

    def get_cover_url(self):
        cover = None
        st = time.localtime()
        year = str(st.tm_year)
        month = "%.2d" % st.tm_mon
        day = "%.2d" % st.tm_mday
        #http://img.kiosko.net/2011/01/02/es/elcorreo.750.jpg
        #http://info.elcorreo.com/pdf/06012011-viz.pdf
        cover='http://info.elcorreo.com/pdf/'+ day + month + year +'-viz.pdf'

        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except:
            self.log("\nPortada no disponible")
            cover ='http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
        return cover

    extra_css = '''
        h1, .headline {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
        h2, .subhead {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:18px;}
        h3, .overhead {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
        h4 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
        h5 {font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
        h6 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
        .date,.byline, .photo {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
        img{margin-bottom: 0.4em}
    '''

    preprocess_regexps = [

        # To present the image of the embedded video
        (re.compile(r'var RUTA_IMAGEN', re.DOTALL|re.IGNORECASE), lambda match: '</script><img src'),
        (re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
        (re.compile(r'var SITIO = "elcorreo";', re.DOTALL|re.IGNORECASE), lambda match: '<SCRIPT TYPE="text/JavaScript"'),

        # To separate paragraphs with a blank line
        (re.compile(r'<div class="p"', re.DOTALL|re.IGNORECASE), lambda match: '<p></p><div class="p"'),

        # To put a blank line between the subtitle and the date and time of the news
        (re.compile(r'<div class="date">', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="date">'),

        # To put a blank line between the intro of the embedded videos and the previous text
        (re.compile(r'<div class="video"', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="video"'),

        # To view photos from the first when these are presented as a gallery
        (re.compile(r'src="/img/shim.gif"', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'rel=', re.DOTALL|re.IGNORECASE), lambda match: 'src='),

        # To remove the link of the title
        (re.compile(r'<h1 class="headline">\n<a href="', re.DOTALL|re.IGNORECASE), lambda match: '<h1 class="'),
        (re.compile(r'</a>\n</h1>', re.DOTALL|re.IGNORECASE), lambda match: '</h1>'),

    ]
43
resources/recipes/el_publico.recipe
Normal file
@ -0,0 +1,43 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Gerardo Diez'
__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
__docformat__ = 'restructuredtext en'

'''
publico.es
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class Publico(BasicNewsRecipe):
    title =u'Publico.es'
    __author__ ='Gerardo Diez'
    publisher =u'Mediapubli Sociedad de Publicaciones y Ediciones S.L.'
    category ='news, politics, finances, world, spain, science, catalunya'
    oldest_article =1
    max_articles_per_feed =100
    simultaneous_downloads =10
    cover_url =u'http://imagenes.publico.es/css/img/logo_publico.gif'
    timefmt ='[%a, %d %b, %Y]'
    encoding ='utf8'
    language ='es'
    remove_javascript =True
    no_stylesheets =True
    keep_only_tags =dict(id='main')
    remove_tags =[
        dict(name='div', attrs={'class':['Noticias_642x50', 'contInfo ancho']}),
        dict(name='ul', attrs={'class':['navComentarios', 'comentarios']}),
        dict(name='div', attrs={'id':['commentsContext', 'toolbar', 'comentarios']}),
        dict(name='h5', attrs={'id':'comentarios'})
    ]
    feeds =[(u'Internacional', u'http://www.publico.es/estaticos/rss/internacional'),
        (u'Espa\xf1a', u'http://www.publico.es/estaticos/rss/espana'),
        (u'Dinero', u'http://www.publico.es/estaticos/rss/dinero'),
        (u'Ciencias', u'http://www.publico.es/estaticos/rss/ciencias'),
        (u'Culturas', u'http://www.publico.es/estaticos/rss/culturas'),
        (u'Deportes', u'http://www.publico.es/estaticos/rss/deportes'),
        (u'Televisi\xf3n y Gente', u'http://www.publico.es/estaticos/rss/televisionygente'),
        (u'Catalu\xf1a', u'http://www.publico.es/estaticos/rss/catalunya'),
        (u'Viajes', u'http://www.publico.es/estaticos/rss/viajes')]
@ -1,7 +1,5 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
eluniversal.com.mx
'''
@ -18,75 +16,25 @@ class ElUniversal(BasicNewsRecipe):
    category = 'news, politics, Mexico'
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'cp1252'
    encoding = 'utf8'
    remove_javascript = True
    language = 'es'
    remove_empty_feeds = True
    publication_type = 'newspaper'
    language = 'es'

    extra_css = '''
        body{font-family:Arial,Helvetica,sans-serif; font-size:x-small;}
        .geoGris30{font-family:Georgia,"Times New Roman",Times,serif; font-size:large; color:#003366; font-weight:bold;}
        .arnegro16{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small;}
        .tbazull2{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; color:#336699; font-size:xx-small;}
        .tbgrisf11{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; color: #666666; font-size:xx-small;}
        .verrojo13{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; color: #CC0033; font-size:xx-small;}
        .trnegro13{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; font-size:xx-small;}
        .txt-fotogaleria{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; font-size:xx-small;}
        body{font-family:Arial,Helvetica,sans-serif}
        .noteTitle{font-family: Georgia,"Times New Roman",Times,serif; color: #336699; font-size: xx-large; font-weight: bold}
        .noteInfo{display: block; color: gray}
    '''
    keep_only_tags = [ dict(name='table', attrs={'width':"633"}),dict(name='table', attrs={'width':"629"}),]

    keep_only_tags = [ dict(name='div', attrs={'id':'noteContent'})]
    remove_tags_after = dict(attrs={'class':'noteText'})
    remove_tags = [
        dict(name='table', attrs={'bgcolor':"#f5f5f5"}),
        dict(name='td', attrs={'bgcolor':"#f7f8f9"}),
        dict(name='td', attrs={'bgcolor':"#f5f5f5"}),
        dict(name='table', attrs={'width':"302"}),
        dict(name='table', attrs={'width':"214"}),
        dict(name='table', attrs={'width':"112"}),
        dict(name='table', attrs={'width':"980"}),
        dict(name='td', attrs={'height':"1"}),
        dict(name='td', attrs={'height':"4"}),
        dict(name='td', attrs={'height':"20"}),
        dict(name='td', attrs={'height':"10"}),
        dict(name='td', attrs={'class':["trrojo11","trbris11","trrojo12","arrojo12s","tbazul13"]}),
        dict(name='div', attrs={'id':["mapg","ver_off_todosloscom","todosloscom"]}),
        dict(name='span', attrs={'class':["trazul18b","trrojo11","trnaranja11","trbris11","georojo18b","geogris18"]}),
        dict(name='span', attrs={'class':["detalles-opinion"]}),
        dict(name='a', attrs={'class':["arnaranja12b","trbris11","arazul12rel","trrojo10"]}),
        dict(name='img', src = "/img/icono_imprimir.gif"),
        dict(name='img', src = "/img/icono_enviar_mail.gif"),
        dict(name='img', src = "/img/icono_fuente_g.gif"),
        dict(name='img', src = "/img/icono_fuente_m.gif"),
        dict(name='img', src = "/img/icono_fuente_c.gif"),
        dict(name='img', src = "/img/icono_compartir.gif"),
        dict(name='img', src = "/img/icono_enviar_coment.gif"),
        dict(name='img', src = "http://www.eluniversal.com.mx/n_img/bot-notasrel.gif"),
        dict(name='img', src = "http://www.eluniversal.com.mx/n_img/fr.gif"),
        dict(name='img', src = "/img/espiral2.gif"),
        dict(name='img', src = "http://www.eluniversal.com.mx/n_img/b"),
        dict(name='img', src = "/img/icono_enviar_coment.gifot-notasrel.gif"),
        dict(name='img', src = "/n_img/icono_tipo3.gif"),
        dict(name='img', src = "/n_img/icono_tipo2.gif"),
        dict(name='img', src = "/n_img/icono_print.gif"),
        dict(name='img', src = "/n_img/icono_mail2.gif"),
        dict(name='img', src = "/n_img/im-comentarios-2a.gif"),
        dict(name='img', src = "/n_img/im-comentarios-1a.gif"),
        dict(name='img', src = "/img/icono_coment.gif"),
        dict(name='img', src = "http://www.eluniversal.com.mx/n_img/bot-sitiosrel.gif"),
        dict(name='img', src = "/n_img/icono_tipomenos.gif"),
        dict(name='img', src = "/img/futbol/19.jpg"),
        dict(name='img', alt = "Facebook"),
        dict(name='img', alt = "Twitter"),
        dict(name='img', alt = "Google"),
        dict(name='img', alt = "LinkedIn"),
        dict(name='img', alt = "Viadeo"),
        dict(name='img', alt = "Digg"),
        dict(name='img', alt = "Delicious"),
        dict(name='img', alt = "Meneame"),
        dict(name='img', alt = "Yahoo"),
        dict(name='img', alt = "Technorati"),
        dict(name='a',text =["Compartir","Facebook","Twitter","Google","LinkedIn","Viadeo","Digg","Delicious","Meneame","Yahoo","Technorati"]),
        dict(name='select'),
        dict(name='a', attrs={'class':"tbgriscompartir"}),
    ]
        dict(attrs={'class':'noteExtras'}),
        dict(name=['meta','iframe','base','embed','object']),
        dict(attrs={'id':'tm_box'})
    ]
    remove_attributes=['lang','onclick']

    feeds = [
        (u'Minuto por Minuto', u'http://www.eluniversal.com.mx/rss/universalmxm.xml' )
@ -101,25 +49,3 @@ class ElUniversal(BasicNewsRecipe):
        ,(u'Computacion' , u'http://www.eluniversal.com.mx/rss/computo.xml' )
        ,(u'Sociedad' , u'http://www.eluniversal.com.mx/rss/sociedad.xml' )
    ]

    # def print_version(self, url):
    #     return url.replace('/notas/','/notas/vi_')

    def preprocess_html(self, soup):
        mtag = '<meta http-equiv="Content-Language" content="es-MX"/><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
        soup.head.insert(0,mtag)
        for tag in soup.findAll(name='td',attrs={'class': 'arazul50'}):
            tag.insert(0,"<h1>")
            tag.insert(2,"</h1>")

        return soup

    def postprocess_html(self, soup,first):

        for tag in soup.findAll(name=['table', 'span','i']):
            tag.name = 'div'
        for item in soup.findAll(align = "right"):
            del item['align']

        return soup
@ -17,7 +17,7 @@ class ElPais_RSS(BasicNewsRecipe):
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    language = 'es_ES'
    language = 'es'
    remove_empty_feeds = True
    publication_type = 'newspaper'
    masthead_url = 'http://www.elpais.com/im/tit_logo.gif'
@ -57,14 +57,14 @@ class ElPais_RSS(BasicNewsRecipe):
        ,(u'Madrid' , u'http://www.elpais.com/rss/feed.html?feedId=1016' )
        ,(u'Pais Vasco' , u'http://www.elpais.com/rss/feed.html?feedId=17062')
        ,(u'Galicia' , u'http://www.elpais.com/rss/feed.html?feedId=17063')
        ,(u'Opinion' , u'http://www.elpais.com/rss/feed.html?feedId=1003' )
        ,(u'Sociedad' , u'http://www.elpais.com/rss/feed.html?feedId=1004' )
        ,(u'Opinion' , u'http://www.elpais.com/rss/feed.html?feedId=1003' )
        ,(u'Sociedad' , u'http://www.elpais.com/rss/feed.html?feedId=1004' )
        ,(u'Deportes' , u'http://www.elpais.com/rss/feed.html?feedId=1007' )
        ,(u'Cultura' , u'http://www.elpais.com/rss/feed.html?feedId=1008' )
        ,(u'Cine' , u'http://www.elpais.com/rss/feed.html?feedId=17052')
        ,(u'Literatura' , u'http://www.elpais.com/rss/feed.html?feedId=17053')
        ,(u'Musica' , u'http://www.elpais.com/rss/feed.html?feedId=17051')
        ,(u'Arte' , u'http://www.elpais.com/rss/feed.html?feedId=17060')
        ,(u'Arte' , u'http://www.elpais.com/rss/feed.html?feedId=17060')
        ,(u'Tecnologia' , u'http://www.elpais.com/rss/feed.html?feedId=1005' )
        ,(u'Economia' , u'http://www.elpais.com/rss/feed.html?feedId=1006' )
        ,(u'Ciencia' , u'http://www.elpais.com/rss/feed.html?feedId=17068')
@ -1,7 +1,5 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
exiledonline.com
'''
@ -20,18 +18,20 @@ class Exiled(BasicNewsRecipe):
    use_embedded_content = False
    encoding = 'utf8'
    remove_javascript = True
    language = 'en'

    cover_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'

    html2lrf_options = [
        '--comment' , description
        , '--base-font-size', '10'
        , '--category' , category
        , '--publisher' , publisher
    ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
    language = 'en'
    publication_type = 'newsblog'
    masthead_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
    extra_css = """
        body{font-family: Arial,Helvetica,sans-serif}
        #topslug{font-size: xx-large; font-weight: bold; color: red}
    """

    conversion_options = {
        'comment' : description
        , 'tags' : category
        , 'publisher' : publisher
        , 'language' : language
    }

    keep_only_tags = [dict(name='div', attrs={'id':'main'})]

@ -47,12 +47,13 @@ class Exiled(BasicNewsRecipe):
    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        mtag = '\n<meta http-equiv="Content-Language" content="en"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n'
        soup.head.insert(0,mtag)
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        return soup

    def get_article_url(self, article):
        raw = article.get('link', None)
        final = raw + 'all/1/'
        return final
@ -1,59 +1,79 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__author__ = 'Gerardo Diez'
__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
__docformat__ = 'restructuredtext en'

'''
www.expansion.com
expansion.es
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class Publico(BasicNewsRecipe):
    title =u'Expansion.com'
    __author__ ='Gerardo Diez'
    publisher =u'Unidad Editorial Información Económica, S.L.'
    category ='finances, catalunya'
    oldest_article =1
    max_articles_per_feed =100
    simultaneous_downloads =10
    cover_url =u'http://estaticos01.expansion.com/iconos/v2.x/v2.0/cabeceras/logo_expansion.png'
    timefmt ='[%A, %d %B, %Y]'
    encoding ='latin'
    language ='es'
    remove_javascript =True
    no_stylesheets =True
    keep_only_tags =dict(name='div', attrs={'class':['noticia primer_elemento']})
    remove_tags =[
        dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto']}),
        dict(name='ul', attrs={'class':['bolos_desarrollo_noticia']}),
        dict(name='span', attrs={'class':['comentarios']}),
        dict(name='p', attrs={'class':['cintillo_comentarios', 'cintillo_comentarios formulario']}),
        dict(name='div', attrs={'id':['comentarios_lectores_listado']})
    ]
    feeds =[
        (u'Portada', u'http://estaticos.expansion.com/rss/portada.xml'),
        (u'Portada: Bolsas', u'http://estaticos.expansion.com/rss/mercados.xml'),
        (u'Divisas', u'http://estaticos.expansion.com/rss/mercadosdivisas.xml'),
        (u'Euribor', u'http://estaticos.expansion.com/rss/mercadoseuribor.xml'),
        (u'Materias Primas', u'http://estaticos.expansion.com/rss/mercadosmateriasprimas.xml'),
        (u'Renta Fija', u'http://estaticos.expansion.com/rss/mercadosrentafija.xml'),

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
        (u'Portada: Mi Dinero', u'http://estaticos.expansion.com/rss/midinero.xml'),
        (u'Hipotecas', u'http://estaticos.expansion.com/rss/midinerohipotecas.xml'),
        (u'Créditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
        (u'Pensiones', u'http://estaticos.expansion.com/rss/midineropensiones.xml'),
        (u'Fondos de Inversión', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
        (u'Motor', u'http://estaticos.expansion.com/rss/midineromotor.xml'),

class Expansion(BasicNewsRecipe):
    title = 'Diario Expansion'
    __author__ = 'Darko Miletic'
    description = 'Lider de informacion de mercados, economica y politica'
    publisher = 'expansion.com'
    category = 'news, politics, Spain'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    delay = 1
    encoding = 'iso-8859-15'
    language = 'es'
        (u'Portada: Empresas', u'http://estaticos.expansion.com/rss/empresas.xml'),
        (u'Banca', u'http://estaticos.expansion.com/rss/empresasbanca.xml'),
        (u'TMT', u'http://estaticos.expansion.com/rss/empresastmt.xml'),
        (u'Energía', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
        (u'Inmobiliario y Construcción', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
        (u'Transporte y Turismo', u'http://estaticos.expansion.com/rss/empresastransporte.xml'),
        (u'Automoción e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
        (u'Distribución', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
        (u'Deporte y Negocio', u' http://estaticos.expansion.com/rss/empresasdeporte.xml'),
        (u'Mi Negocio', u'http://estaticos.expansion.com/rss/empresasminegocio.xml'),
        (u'Interiores', u'http://estaticos.expansion.com/rss/empresasinteriores.xml'),
        (u'Digitech', u'http://estaticos.expansion.com/rss/empresasdigitech.xml'),

    direction = 'ltr'
        (u'Portada: Economía y Política', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
        (u'Política', u'http://estaticos.expansion.com/rss/economia.xml'),
        (u'Portada: Sociedad', u'http://estaticos.expansion.com/rss/entorno.xml'),

    html2lrf_options = [
        '--comment' , description
        , '--category' , category
        , '--publisher', publisher
    ]
        (u'Portada: Opinión', u'http://estaticos.expansion.com/rss/opinion.xml'),
        (u'Llaves y editoriales', u'http://estaticos.expansion.com/rss/opinioneditorialyllaves.xml'),
        (u'Tribunas', u'http://estaticos.expansion.com/rss/opiniontribunas.xml'),

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
        (u'Portada: Jurídico', u'http://estaticos.expansion.com/rss/juridico.xml'),
        (u'Entrevistas', u'http://estaticos.expansion.com/rss/juridicoentrevistas.xml'),
        (u'Opinión', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
        (u'Sentencias', u'http://estaticos.expansion.com/rss/juridicosentencias.xml'),

    feeds = [
        (u'Ultimas noticias', u'http://rss.expansion.com/rss/descarga.htm?data2=178')
        ,(u'Temas del dia' , u'http://rss.expansion.com/rss/descarga.htm?data2=178')
    ]

    keep_only_tags = [dict(name='div', attrs={'id':'principal'})]

    remove_tags = [
        dict(name=['object','link','script'])
        ,dict(name='div', attrs={'class':['utilidades','tit_relacionadas']})
    ]

    remove_tags_after = [dict(name='div', attrs={'class':'tit_relacionadas'})]

    def preprocess_html(self, soup):
        soup.html['dir' ] = self.direction
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
        soup.head.insert(0,mcharset)
        for item in soup.findAll(style=True):
            del item['style']
        return soup
        (u'Mujer', u'http://estaticos.expansion.com/rss/mujer-empresa.xml'),
        (u'Cataluña', u'http://estaticos.expansion.com/rss/catalunya.xml'),
        (u'Función pública', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
    ]
@ -1,5 +1,5 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
ft.com
'''
@ -52,22 +52,38 @@ class FinancialTimes(BasicNewsRecipe):
    .copyright{font-size: x-small}
    """

    def parse_index(self):
    def get_artlinks(self, elem):
        articles = []
        for item in elem.findAll('a',href=True):
            url = self.PREFIX + item['href']
            title = self.tag_to_string(item)
            date = strftime(self.timefmt)
            articles.append({
                'title' :title
                ,'date' :date
                ,'url' :url
                ,'description':''
            })
        return articles

    def parse_index(self):
        feeds = []
        soup = self.index_to_soup(self.INDEX)
        wide = soup.find('div',attrs={'class':'wide'})
        if wide:
            for item in wide.findAll('a',href=True):
                url = self.PREFIX + item['href']
                title = self.tag_to_string(item)
                date = strftime(self.timefmt)
                articles.append({
                    'title' :title
                    ,'date' :date
                    ,'url' :url
                    ,'description':''
                })
            return [('FT UK edition',articles)]
        if not wide:
            return feeds
        strest = wide.findAll('h3', attrs={'class':'section'})
        if not strest:
            return feeds
        st = wide.find('h4',attrs={'class':'section-no-arrow'})
        if st:
            strest.insert(0,st)
        for item in strest:
            ftitle = self.tag_to_string(item)
            self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle))
            feedarts = self.get_artlinks(item.parent.ul)
            feeds.append((ftitle,feedarts))
        return feeds

    def preprocess_html(self, soup):
        return self.adeify_images(soup)
@ -1,4 +1,5 @@
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
import re

class NatureNews(BasicNewsRecipe):
@ -10,17 +11,76 @@ class NatureNews(BasicNewsRecipe):
    max_articles_per_feed = 50

    no_stylesheets = True
    remove_tags_before = dict(name='h1', attrs={'class':'heading entry-title'})
    remove_tags_after = dict(name='h2', attrs={'id':'comments'})
    keep_only_tags = [dict(name='div', attrs={'id':'content'})]
    # remove_tags_before = dict(name='h1', attrs={'class':'heading entry-title'})
    # remove_tags_after = dict(name='h2', attrs={'id':'comments'})
    remove_tags = [
        dict(name='h2', attrs={'id':'comments'}),
        dict(attrs={'alt':'Advertisement'}),
        dict(name='div', attrs={'class':'ad'}),
    ]
        dict(attrs={'class':'Z3988'}),
        dict(attrs={'class':['formatpublished','type-of-article','cleardiv','disclaimer','buttons','comments xoxo']}),
        dict(name='a', attrs={'href':'#comments'}),
        dict(name='h2',attrs={'class':'subheading plusicon icon-add-comment'})
    ]

    preprocess_regexps = [
        (re.compile(r'<p>ADVERTISEMENT</p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
    ]

    extra_css = '''
        .author { text-align: right; font-size: small; line-height:1em; margin-top:0px; margin-left:0; margin-right:0; margin-bottom: 0; }
        .imagedescription { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
        .imagecredit { font-size: x-small; font-style: normal; font-weight: bold}
    '''

    feeds = [('Nature News', 'http://feeds.nature.com/news/rss/most_recent')]

    def preprocess_html(self,soup):
        # The author name is slightly buried - dig it up
        author = soup.find('p', {'class':'byline'})
        if author:
            # Find out the author's name
            authornamediv = author.find('span',{'class':'author fn'})
            authornamelink = authornamediv.find('a')
            if authornamelink:
                authorname = authornamelink.contents[0]
            else:
                authorname = authornamediv.contents[0]
            # Stick the author's name in the byline tag
            tag = Tag(soup,'div')
            tag['class'] = 'author'
            tag.insert(0,authorname.strip())
            author.replaceWith(tag)

        # Change the intro from a p to a div
        intro = soup.find('p',{'class':'intro'})
        if intro:
            tag = Tag(soup,'div')
            tag['class'] = 'intro'
            tag.insert(0,intro.contents[0])
            intro.replaceWith(tag)

        # Change span class=imagedescription to div
        descr = soup.find('span',{'class':'imagedescription'})
        if descr:
            tag = Tag(soup,'div')
            tag['class'] = 'imagedescription'
            tag.insert(0,descr.renderContents())
            descr.replaceWith(tag)

        # The references are in a list, let's make them simpler
        reflistcont = soup.find('ul',{'id':'article-refrences'})
        if reflistcont:
            reflist = reflistcont.li.renderContents()
            tag = Tag(soup,'div')
            tag['class'] = 'article-references'
            tag.insert(0,reflist)
            reflistcont.replaceWith(tag)

        # Within the id=content div, we need to remove all the stuff after the end of the class=entry-content
        entrycontent = soup.find('div',{'class':'entry-content'})
        for nextSibling in entrycontent.findNextSiblings():
            nextSibling.extract()

        return soup
@ -8,12 +8,13 @@ __docformat__ = 'restructuredtext en'
globeandmail.com
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1287083651(BasicNewsRecipe):
    title = u'Globe & Mail'
    __license__ = 'GPL v3'
    __author__ = 'Szing'
    __author__ = 'Kovid Goyal'
    oldest_article = 2
    no_stylesheets = True
    max_articles_per_feed = 100
@ -38,24 +39,19 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
        (u'Sports', u'http://www.theglobeandmail.com/auto/?service=rss')
    ]

    keep_only_tags = [
        dict(name='h1'),
        dict(name='h2', attrs={'id':'articletitle'}),
        dict(name='p', attrs={'class':['leadText', 'meta', 'leadImage', 'redtext byline', 'bodyText']}),
        dict(name='div', attrs={'class':['news','articlemeta','articlecopy']}),
        dict(name='id', attrs={'class':'article'}),
        dict(name='table', attrs={'class':'todays-market'}),
        dict(name='header', attrs={'id':'leadheader'})
    ]
    preprocess_regexps = [
        (re.compile(r'<head.*?</head>', re.DOTALL), lambda m: '<head></head>'),
        (re.compile(r'<script.*?</script>', re.DOTALL), lambda m: ''),
    ]

    remove_tags_before = dict(name='h1')
    remove_tags = [
        dict(name='div', attrs={'id':['tabInside', 'ShareArticles', 'topStories']})
    ]

    #this has to be here or the text in the article appears twice.
    remove_tags_after = [dict(id='article')]
        dict(name='div', attrs={'id':['ShareArticles', 'topStories']}),
        dict(href=lambda x: x and 'tracking=' in x),
        {'class':['articleTools', 'pagination', 'Ads', 'topad',
                  'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]

    #Use the mobile version rather than the web version
    def print_version(self, url):
        return url + '&service=mobile'
        return url.rpartition('?')[0] + '?service=mobile'
@ -3,29 +3,31 @@ __license__ = 'GPL v3'
__copyright__ = '04 December 2010, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Aragon'
__version__ = 'v0.03'
__date__ = '11, December 2010'
__version__ = 'v0.04'
__date__ = '6, January 2011'
'''
http://www.heraldo.es/
'''

import time
import re
from calibre.web.feeds.news import BasicNewsRecipe

class heraldo(BasicNewsRecipe):
    __author__ = 'desUBIKado'
    description = 'Daily newspaper from Aragon'
    __author__ = 'desUBIKado'
    description = 'Daily newspaper from Aragon'
    title = u'Heraldo de Aragon'
    publisher = 'OJD Nielsen'
    category = 'News, politics, culture, economy, general interest'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 1
    oldest_article = 2
    delay = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    remove_javascript = True
    no_stylesheets = True
    recursion = 10

    feeds = [
        (u'Portadas', u'http://www.heraldo.es/index.php/mod.portadas/mem.rss')
@ -37,29 +39,39 @@ class heraldo(BasicNewsRecipe):

    remove_tags = [dict(name='a', attrs={'class':['com flo-r','enl-if','enl-df']}),
                   dict(name='div', attrs={'class':['brb-b-s con marg-btt','cnt-rel con']}),
                   dict(name='form', attrs={'class':'form'})]
                   dict(name='form', attrs={'class':'form'}),
                   dict(name='ul', attrs={'id':['cont-tags','pag-1']})]

    remove_tags_before = dict(name='div' , attrs={'id':'dts'})
    remove_tags_after = dict(name='div' , attrs={'id':'com'})

    def get_cover_url(self):
        cover = None
        st = time.localtime()
        year = str(st.tm_year)
        month = "%.2d" % st.tm_mon
        day = "%.2d" % st.tm_mday
        cover = None
        st = time.localtime()
        year = str(st.tm_year)
        month = "%.2d" % st.tm_mon
        day = "%.2d" % st.tm_mday
        #http://oldorigin-www.heraldo.es/20101211/primeras/portada_aragon.pdf
        cover='http://oldorigin-www.heraldo.es/'+ year + month + day +'/primeras/portada_aragon.pdf'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except:
            self.log("\nPortada no disponible")
            cover ='http://www.heraldo.es/MODULOS/global/publico/interfaces/img/logo-Heraldo.png'
        return cover

        cover='http://oldorigin-www.heraldo.es/'+ year + month + day +'/primeras/portada_aragon.pdf'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except:
            self.log("\nPortada no disponible")
            cover ='http://www.heraldo.es/MODULOS/global/publico/interfaces/img/logo-Heraldo.png'
        return cover

    extra_css = '''
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:xx-large;}
    '''
        .con strong{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
        .con h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
        .con span{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:12px;}
        .ent {font-family:Arial,Helvetica,sans-serif; font-weight:normal; font-style:italic; font-size:18px;}
        img{margin-bottom: 0.4em}
    '''

    preprocess_regexps = [

        # To separate the comments with a blank line
        (re.compile(r'<div id="com"', re.DOTALL|re.IGNORECASE), lambda match: '<br><div id="com"')
    ]
23
resources/recipes/ibm_smarter_planet.recipe
Normal file
@ -0,0 +1,23 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1293122276(BasicNewsRecipe):
    title = u'Smarter Planet | Tumblr for eReaders'
    __author__ = 'Jack Mason'
    author = 'IBM Global Business Services'
    publisher = 'IBM'
    category = 'news, technology, IT, internet of things, analytics'
    oldest_article = 7
    max_articles_per_feed = 30
    no_stylesheets = True
    use_embedded_content = False
    masthead_url = 'http://30.media.tumblr.com/tumblr_l70dow9UmU1qzs4rbo1_r3_250.jpg'
    remove_tags_before = dict(id='item')
    remove_tags_after = dict(id='item')
    remove_tags = [dict(attrs={'class':['sidebar', 'about', 'footer', 'description', 'disqus', 'nav', 'notes', 'disqus_thread']}),
                   dict(id=['sidebar', 'footer', 'disqus', 'nav', 'notes', 'likes_container', 'description', 'disqus_thread', 'about']),
                   dict(name=['script', 'noscript', 'style'])]

    feeds = [(u'Smarter Planet Tumblr', u'http://smarterplanet.tumblr.com/mobile/rss')]
182
resources/recipes/ihned.recipe
Normal file
@ -0,0 +1,182 @@
import re, time
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe

class IHNed(BasicNewsRecipe):

    stahnout_vsechny = False
    # True = download everything from the homepage
    # False = download only today's articles (from the day the script is run)

    title = 'iHNed'
    __author__ = 'Karel Bílek'
    language = 'cs'
    description = 'Zprávy z iHNed.cz'
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = False
    remove_tags = [dict(attrs={'class':['borderbottom', 'web', 'foot', 'reklama', 'd-elm d-rellinks', 'd-elm']}),
                   dict(style=['text-align: center;']),
                   dict(id=['r-bfull']),
                   dict(name=['script', 'noscript', 'style'])]
    encoding = 'windows-1250'
    no_stylesheets = True
    remove_tags_before = dict(attrs={'class':'d-nadtit'})
    remove_tags_after = dict(attrs={'class':'like'})

    conversion_options = {
        'linearize_tables' : True,
    }

    def preprocess_html(self, soup):

        def makeurl(wat):
            return "http://ihned.cz"+wat;

        for h1 in soup.findAll('h1'):
            a = h1.find('a')
            if a:
                string = a.string
                if string:
                    soup.a.replaceWith(string)
        for a in soup.findAll('a', href=True) :
            cil = str(a['href'])
            if cil.startswith("/") or cil.startswith("index"):
                a['href'] = makeurl(cil)
        return soup

    def parse_index(self):

        def makeurl(wat):
            if wat.startswith("/") or wat.startswith("index"):
                return "http://ihned.cz"+wat;
            else:
                return wat

        articles = {}  # the result, probably
        key = None     # the current section
        ans = []       # all the sections

        articles["Hlavní"] = []
        ans.append("Hlavní")

        was = {}

        def parse_subpage(url, name):
            articles[name] = []
            ans.append(name)

            soup = self.index_to_soup(url)
            otvirak = soup.find(True, attrs={'class':['otv']})
            if otvirak:

                #the code is copypasted here because I don't know python. simple as that.
                a = otvirak.find('a', href=True)
                title = self.tag_to_string(a, use_alt=True).strip()
                txt = otvirak.find(True, attrs={'class':['txt']})
                description = ''
                if txt:
                    match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
                    if match:
                        description = match.group(1)

                pubdate = strftime('%d. %m.')
                if not title in was:
                    articles[name].append(
                        dict(title=title, url=makeurl(a['href']), date=pubdate,
                             description=description,
                             content=''))

            otv234 = soup.find(True, attrs={'class':['otv234', 'col2a']})
            if otv234:
                for ow in otv234.findAll(True, attrs={'class':['ow']}):
                    a = ow.find('a', href=True)
                    title = self.tag_to_string(a, use_alt=True).strip()
                    description=''
                    prx = ow.find(True, attrs={'class':['prx']});
                    if prx:
                        description = str(prx.string)
                    nfo = ow.find(True, attrs={'class':['nfo']});
                    pubdate = ''
                    if nfo:
                        dtime = time.localtime();
                        day = dtime[2]
                        month = dtime[1]

                        pubdate = strftime('%d. %m.')

                        match = re.search(r'([0-9]*)\.([0-9]*)\.', str(nfo))

                        if self.stahnout_vsechny or (int(day) == int(match.group(1)) and int(month) == int(match.group(2))):
                            if not title in was:
                                articles[name].append(
                                    dict(title=title, url=makeurl(a['href']), date=pubdate,
                                         description=description,
                                         content=''))

        soup = self.index_to_soup('http://ihned.cz/')
        otvirak = soup.find(True, attrs={'class':['otv']})
        if otvirak:
            a = otvirak.find('a', href=True)
            title = self.tag_to_string(a, use_alt=True).strip()
            txt = otvirak.find(True, attrs={'class':['txt']})
            description = ''
            if txt:
                match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
                if match:
                    description = match.group(1)

            pubdate = strftime('%d. %m.')
            feed = "Hlavní"
            articles[feed].append(
                dict(title=title, url=(a['href']), date=pubdate,
                     description=description,
                     content=''))
            was[title]=1

        otvirak2345 = soup.find(True, attrs={'class':['otv2345']})
        if otvirak2345:
            for otv2 in otvirak2345.findAll(True, attrs={'class':['otv2-5']}):
                a = otv2.find('a', attrs={'class':['tit2']}, href=True)
                title = self.tag_to_string(a, use_alt=True).strip()
                description=''
                span = otv2.find('span');
                if span:
                    match = re.match(r'<span>\s*([^<]*)\s*<a', str(span), re.L)
                    if match:
                        description = match.group(1)
                feed = "Hlavní"
                pubdate = strftime('%d. %m.')
                articles[feed].append(
                    dict(title=title, url=(a['href']), date=pubdate,
                         description=description,
                         content=''))
                was[title]=1

        parse_subpage("http://komentare.ihned.cz/", "Komentáře")
        parse_subpage("http://domaci.ihned.cz", "Domácí")
        parse_subpage("http://ekonomika.ihned.cz", "Ekonomika")
        parse_subpage("http://zahranicni.ihned.cz/", "Zahraničí");
        parse_subpage("http://finweb.ihned.cz/", "Finance");
        parse_subpage("http://digiweb.ihned.cz/", "DigiWeb");
        parse_subpage("http://kultura.ihned.cz/", "Kultura")
        parse_subpage("http://sport.ihned.cz/", "Sport");

        # sort the categories
        ans = self.sort_index_by(ans, {'Hlavni':1, 'Domácí':2, 'Ekonomika':5, 'Zahraničí':3, 'Finance':6, 'DigiWeb':7, 'Kultura':8, 'Sport':9, 'Komentáře':4})

        # return them, but only those that are in the categories...
        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        return ans
52
resources/recipes/karlsruhe.recipe
Normal file
@ -0,0 +1,52 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe

class KANewsRecipe(BasicNewsRecipe):
    title = u'KA-News.de'
    description = u'Nachrichten aus Karlsruhe, Deutschland und der Welt.'
    __author__ = 'tfeld'
    lang='de'
    no_stylesheets = True

    oldest_article = 7
    max_articles_per_feed = 100

    feeds = [
        (u'News aus Karlsruhe', 'http://www.ka-news.de/storage/rss/rss/karlsruhe.xml'),
        (u'Kulturnachrichten aus Karlsruhe', 'http://www.ka-news.de/storage/rss/rss/kultur.xml'),
        (u'Durlach: News aus Durlach', 'http://www.ka-news.de/storage/rss/rss/durlach.xml'),
        (u'Stutensee: News aus Stutensee Blankenloch, Büchig, Friedrichstal, Staffort, Spöck', 'http://www.ka-news.de/storage/rss/rss/stutensee.xml'),
        (u'Bruchsal: News aus Bruchsal', 'http://www.ka-news.de/storage/rss/rss/bruchsal.xml'),
        (u'Wirtschaftsnews aus Karlsruhe', 'http://www.ka-news.de/storage/rss/rss/wirtschaft.xml'),
        (u'ka-news.de - Sport', 'http://www.ka-news.de/storage/rss/rss/sport.xml'),
        (u'KSC-News - News rund um den KSC', 'http://www.ka-news.de/storage/rss/rss/ksc.xml'),
        (u'ka-news.de - BG Karlsruhe', 'http://www.ka-news.de/storage/rss/rss/basketball.xml')
    ]

    preprocess_regexps = [
        (re.compile(r'width:[0-9]*?px', re.DOTALL|re.IGNORECASE), lambda match: ''),
    ]

    remove_tags_before = dict(id='artdetail_ueberschrift')
    remove_tags_after = dict(id='artdetail_unterzeile')
    remove_tags = [dict(name=['div'], attrs={'class': 'lbx_table'}),
                   dict(name=['div'], attrs={'class': 'lk_zumthema'}),
                   dict(name=['div'], attrs={'class': 'lk_thumb'}),
                   dict(name=['div'], attrs={'class': 'lk_trenner'}),
                   dict(name=['div'], attrs={'class': 'lupen_container'}),
                   dict(name=['script']),
                   dict(name=['span'], attrs={'style': 'display:none;'}),
                   dict(name=['span'], attrs={'class': 'comm_info'}),
                   dict(name=['h3'], attrs={'id': 'artdetail_unterzeile'})]

    # removing style attribute _after_ removing specific tags above
    remove_attributes = ['width','height','style']

    extra_css = '''
        h1{ font-size:large; font-weight:bold; }
        h2{ font-size:medium; font-weight:bold; }
    '''

    def get_cover_url(self):
        return 'http://www.ka-news.de/storage/scl/techkanews/logos/434447_m1t1w250q75s1v29681_ka-news-Logo_mit_Schatten_transparent.png'
16
resources/recipes/kath_net.recipe
Normal file
@ -0,0 +1,16 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1295262156(BasicNewsRecipe):
    title = u'kath.net'
    __author__ = 'Bobus'
    oldest_article = 7
    max_articles_per_feed = 100

    feeds = [(u'kath.net', u'http://www.kath.net/2005/xml/index.xml')]

    def print_version(self, url):
        return url+"&print=yes"

    extra_css = 'td.textb {font-size: medium;}'
@ -28,6 +28,8 @@ class LaRepubblica(BasicNewsRecipe):
    recursion = 10

    remove_javascript = True
    no_stylesheets = True

    def get_article_url(self, article):
        link = article.get('id', article.get('guid', None))
        if link is None:
@ -1,4 +1,3 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini'
__copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
@ -14,7 +13,7 @@ from calibre.web.feeds.news import BasicNewsRecipe

class ledevoir(BasicNewsRecipe):
    author = 'Lorenzo Vigentini'
    description = 'Canadian Paper'
    description = 'Canadian Paper. A subscription is optional; with it you get more content'

    cover_url = 'http://www.ledevoir.com/images/ul/graphiques/logo_devoir.gif'
    title = u'Le Devoir'
@ -28,6 +27,7 @@ class ledevoir(BasicNewsRecipe):
    max_articles_per_feed = 50
    use_embedded_content = False
    recursion = 10
    needs_subscription = 'optional'

    remove_javascript = True
    no_stylesheets = True
@ -77,3 +77,12 @@ class ledevoir(BasicNewsRecipe):
    .credit {color:#787878;font-size:0.71em;line-height:1.1em;font-weight:bold;}
    .texte {font-size:1.15em;line-height:1.4em;margin-bottom:17px;}
    '''
    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://www.ledevoir.com')
            br.select_form(nr=1)
            br['login[courriel]'] = self.username
            br['login[password]'] = self.password
            br.submit()
        return br
32
resources/recipes/mail_and_guardian.recipe
Normal file
@ -0,0 +1,32 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1295081935(BasicNewsRecipe):
    title = u'Mail & Guardian ZA News'
    __author__ = '77ja65'
    language = 'en'
    oldest_article = 7
    max_articles_per_feed = 30
    no_stylesheets = True
    masthead_url = 'http://c1608832.cdn.cloudfiles.rackspacecloud.com/mg_logo.gif'
    remove_tags_after = [dict(id='content')]

    feeds = [
        (u'National News', u'http://www.mg.co.za/rss/national'),
        (u'Top Stories', u'http://www.mg.co.za/rss'),
        (u'Africa News', u'http://www.mg.co.za/rss/africa'),
        (u'Sport', u'http://www.mg.co.za/rss/sport'),
        (u'Business', u'http://www.mg.co.za/rss/business'),
        (u'And In Other News', u'http://www.mg.co.za/rss/and-in-other-news'),
        (u'World News', u'http://www.mg.co.za/rss/world')
    ]

    def print_version(self, url):
        return url.replace('http://www.mg.co.za/article/',
                           'http://www.mg.co.za/printformat/single/')

    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
    '''
@ -1,10 +1,9 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
msnbc.msn.com
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class MsNBC(BasicNewsRecipe):
@ -19,7 +18,16 @@ class MsNBC(BasicNewsRecipe):
    publisher = 'msnbc.com'
    category = 'news, USA, world'
    language = 'en'
    extra_css = ' body{ font-family: sans-serif } .head{font-family: serif; font-size: xx-large; font-weight: bold; color: #CC0000} .abstract{font-weight: bold} .source{font-size: small} .updateTime{font-size: small} '
    extra_css = """
        body{ font-family: Georgia,Times,serif }
        .hide{display: none}
        .caption{font-family: Arial,sans-serif; font-size: x-small}
        .entry-summary{font-family: Arial,sans-serif}
        .copyright{font-size: 0.95em; font-style: italic}
        .source-org{font-size: small; font-family: Arial,sans-serif}
        img{display: block; margin-bottom: 0.5em}
        span.byline{display: none}
    """

    conversion_options = {
        'comments' : description
@ -28,14 +36,20 @@ class MsNBC(BasicNewsRecipe):
        ,'publisher': publisher
    }

    preprocess_regexps = [
        (re.compile(r'</style></head>', re.DOTALL|re.IGNORECASE),lambda match: '</style>')
        ,(re.compile(r'<div class="head">', re.DOTALL|re.IGNORECASE),lambda match: '</head><body><div class="head">'),
    ]
    remove_tags_before = dict(name='h1', attrs={'id':'headline'})
    remove_tags_after = dict(name='span', attrs={'class':['copyright','Linear copyright']})
    keep_only_tags=[
        dict(attrs={'id':['headline','deck','byline','source','intelliTXT']})
        ,dict(attrs={'class':['gl_headline','articleText','drawer-content Linear','v-center3','byline','textBodyBlack']})
    ]
    remove_attributes=['property','lang','rel','xmlns:fb','xmlns:v','xmlns:dc','xmlns:dcmitype','xmlns:og','xmlns:media','xmlns:vcard','typeof','itemscope','itemtype','itemprop','about','type','size','width','height','onreadystatechange','data','border','hspace','vspace']

    remove_tags = [
        dict(name=['iframe','object','link','embed','meta','table'])
        ,dict(name='span', attrs={'class':['copyright','Linear copyright']})
        ,dict(name='div', attrs={'class':'social'})
    ]

    remove_tags_before = dict(name='div', attrs={'class':'head'})
    remove_tags_after = dict(name='div', attrs={'class':'copyright'})
    remove_tags = [dict(name=['iframe','object','link','script','form'])]

    feeds = [
        (u'US News' , u'http://rss.msnbc.msn.com/id/3032524/device/rss/rss.xml' )
@ -48,11 +62,26 @@ class MsNBC(BasicNewsRecipe):
        ,(u'Tech & Science', u'http://rss.msnbc.msn.com/id/3032117/device/rss/rss.xml' )
    ]

    def print_version(self, url):
        return url + 'print/1/displaymode/1098/'

    def preprocess_html(self, soup):
        for item in soup.head.findAll('div'):
            item.extract()
        for item in soup.body.findAll('html'):
            item.name='div'
        for item in soup.body.findAll('div'):
            if item.has_key('id') and item['id'].startswith('vine-'):
                item.extract()
            if item.has_key('class') and ( item['class'].startswith('ad') or item['class'].startswith('vine')):
                item.extract()
        for item in soup.body.findAll('img'):
            if not item.has_key('alt'):
                item['alt'] = 'image'
        for item in soup.body.findAll('ol'):
            if item.has_key('class') and item['class'].startswith('grid'):
                item.extract()
        for item in soup.body.findAll('span'):
            if ( item.has_key('id') and item['id'].startswith('byLine') and item.string is None) or ( item.has_key('class') and item['class'].startswith('inline') ):
                item.extract()
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        return soup
 74  resources/recipes/new_london_day.recipe  Normal file
@@ -0,0 +1,74 @@
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1294342201(BasicNewsRecipe):
    title = u'New London Day'
    __author__ = 'Being'
    description = 'State, local and business news from New London, CT'
    language = 'en_GB'
    oldest_article = 1
    max_articles_per_feed = 200

    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    remove_tags_before = dict(id='article')
    remove_tags_after = dict(id='article')
    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
                   dict(name=['script', 'noscript', 'style'])]
    remove_tags_after = [ {'class':['photo_article',]} ]
    remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer"]},
                   {'class':["clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent"]},
                   dict(name='font',attrs={'id':["cr-other-headlines"]})]
    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
        .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
        .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center}
        .story{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''

    feeds = [
        (u'All News', u'http://www.theday.com/section/rss'),
        (u'Breaking News', u'http://www.theday.com/section/rss01'),
        (u'Police and Courts', u'http://www.theday.com/section/rss02'),
        (u'State News', u'http://www.theday.com/section/rss03'),
        (u'Local Business', u'http://www.theday.com/section/rss04'),
        (u'Entertainment', u'http://www.theday.com/section/rss05'),
        (u'Opinion', u'http://www.theday.com/section/rss06'),
        (u'Casinos', u'http://www.theday.com/section/rss12'),
        (u'Defense and Military', u'http://www.theday.com/section/rss14'),
        (u'Ann Baldelli Ruminations', u'http://www.theday.com/section/rss20'),
        (u'Paul Choiniere Ruminations', u'http://www.theday.com/section/rss21'),
        (u'Michael Costanza Omnivore', u'http://www.theday.com/section/rss23'),
        (u'Rebecca Dangelo Reel Life', u'http://www.theday.com/section/rss25'),]

    def print_version(self, url):
        return url.replace('/index.html', '/print.html')

    def get_article_url(self, article):
        return article.get('feedburner_origlink', article.get('guid', article.get('link')))

    def postprocess_html(self, soup, first_fetch):
        for t in soup.findAll(['table', 'tr', 'td']):
            t.name = 'div'

        for tag in soup.findAll('form', dict(attrs={'name':["comments_form"]})):
            tag.extract()
        for tag in soup.findAll('font', dict(attrs={'id':["cr-other-headlines"]})):
            tag.extract()

        return soup

@@ -27,6 +27,9 @@ class NikkeiNet_sub_economy(BasicNewsRecipe):
        {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
        {'class':"cmn-article_keyword cmn-clearfix"},
        {'class':"cmn-print_headline cmn-clearfix"},
        {'class':"cmn-article_list"},
        dict(id="ABOUT-NIKKEI"),
        {'class':"cmn-sub_market"},
    ]
    remove_tags_after = {'class':"cmn-pr_list"}

 32  resources/recipes/njp.recipe  Normal file
@@ -0,0 +1,32 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = u'Chema Cort\xe9s - 2011-01-05'
__version__ = 'v0.01'
__date__ = '2011-01-05'
'''
njp.org
'''

from calibre.web.feeds.news import BasicNewsRecipe

class NewJournalOfPhysics(BasicNewsRecipe):
    title = u'New Journal of Physics'
    __author__ = u'Chema Cort\xe9s'
    description = u'The open-access journal for physics'
    publisher = u'IOP (Institute of Physics)'
    category = 'physics, journal, science'
    language = 'en'

    oldest_article = 30
    max_articles_per_feed = 100

    keep_only_tags = [dict(id=['fulltextContainer'])]
    no_stylesheets = True
    use_embedded_content = False

    feeds = [(u'Latest Papers', u'http://iopscience.iop.org/1367-2630/?rss=1')]

    def print_version(self, url):
        return url + "/fulltext"
@@ -685,3 +685,28 @@ class NYTimes(BasicNewsRecipe):
                divTag.replaceWith(tag)

        return soup

    def populate_article_metadata(self, article, soup, first):
        shortparagraph = ""
        try:
            if len(article.text_summary.strip()) == 0:
                articlebodies = soup.findAll('div',attrs={'class':'articleBody'})
                if articlebodies:
                    for articlebody in articlebodies:
                        if articlebody:
                            paras = articlebody.findAll('p')
                            for p in paras:
                                refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
                                # account for blank paragraphs and short paragraphs by appending them to longer ones
                                if len(refparagraph) > 0:
                                    if len(refparagraph) > 70: # approximately one line of text
                                        article.summary = article.text_summary = shortparagraph + refparagraph
                                        return
                                    else:
                                        shortparagraph = refparagraph + " "
                                        if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
                                            shortparagraph = shortparagraph + "- "
        except:
            self.log("Error creating article descriptions")
            return

@@ -685,4 +685,27 @@ class NYTimes(BasicNewsRecipe):
                divTag.replaceWith(tag)

        return soup
    def populate_article_metadata(self, article, soup, first):
        shortparagraph = ""
        try:
            if len(article.text_summary.strip()) == 0:
                articlebodies = soup.findAll('div',attrs={'class':'articleBody'})
                if articlebodies:
                    for articlebody in articlebodies:
                        if articlebody:
                            paras = articlebody.findAll('p')
                            for p in paras:
                                refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
                                # account for blank paragraphs and short paragraphs by appending them to longer ones
                                if len(refparagraph) > 0:
                                    if len(refparagraph) > 70: # approximately one line of text
                                        article.summary = article.text_summary = shortparagraph + refparagraph
                                        return
                                    else:
                                        shortparagraph = refparagraph + " "
                                        if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
                                            shortparagraph = shortparagraph + "- "
        except:
            self.log("Error creating article descriptions")
            return

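Both NYTimes hunks above add the same summary heuristic to two recipe files: short leading paragraphs are accumulated until a paragraph of roughly one line (over 70 characters) completes the description. Distilled into a standalone sketch (the function name and sample input are invented for illustration, not part of the commit):

    def first_line_summary(paragraphs):
        # Mirrors the heuristic above: glue short fragments onto the next
        # paragraph until the accumulated text passes ~70 characters.
        short = ""
        for p in paragraphs:
            p = p.strip()
            if not p:
                continue
            if len(p) > 70:
                return short + p
            short += p + " "
            if short.strip().find(" ") == -1 and not short.strip().endswith(":"):
                short += "- "
        return short

    print first_line_summary(["By A. Reporter",
        "A headline-length opening paragraph that easily runs past seventy characters."])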
 61  resources/recipes/pressthink.recipe  Normal file
@@ -0,0 +1,61 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
pressthink.org
'''

from calibre.web.feeds.recipes import BasicNewsRecipe

class PressThink(BasicNewsRecipe):
    title = 'PressThink'
    __author__ = 'Darko Miletic'
    description = 'Ghost of democracy in the media machine'
    oldest_article = 60
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf8'
    publisher = 'Arthur L. Carter Journalism Institute'
    category = 'news, USA, world, economy, politics, media'
    language = 'en'
    publication_type = 'blog'
    extra_css = """
        body{ font-family: Helvetica,Arial,sans-serif }
        img{display: block; margin-bottom: 0.5em}
        h6{font-size: 1.1em; font-weight: bold}
        .post-author{font-family: Georgia,serif}
        .post-title{color: #AB0000}
        .says{color: gray}
        .comment {
            border-bottom: 1px dotted #555555;
            border-top: 1px dotted #DDDDDD;
            margin-left: 10px;
            min-height: 100px;
            padding: 15px 0 20px;
        }
    """

    conversion_options = {
        'comments' : description
        ,'tags' : category
        ,'language' : language
        ,'publisher': publisher
    }

    remove_tags = [dict(name=['form','iframe','embed','object','link','base','table','meta'])]
    keep_only_tags = [dict(attrs={'class':['post-title','post-author','entry','postmetadata alt','commentlist']})]

    feeds = [(u'Articles', u'http://pressthink.org/feed/')]

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll('img', alt=False):
            item['alt'] = 'image'
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        return soup

@@ -17,8 +17,8 @@ class SmithsonianMagazine(BasicNewsRecipe):
    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class':'article_sidebar_border'}),
        dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large']}),
        #dict(name='ul', attrs={'class':'article-tools'}),
        dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large', 'most-popular-body_large']}),
        ##dict(name='ul', attrs={'class':'article-tools'}),
        dict(name='ul', attrs={'class':'cat-breadcrumb col three last'}),
    ]

@@ -37,16 +37,16 @@ class SmithsonianMagazine(BasicNewsRecipe):
    ]

    def preprocess_html(self, soup):
        story = soup.find(name='div', attrs={'id':'article-left'})
        #td = heading.findParent(name='td')
        #td.extract()
        story = soup.find(name='div', attrs={'id':'article-body'})
        ##td = heading.findParent(name='td')
        ##td.extract()
        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
        body = soup.find(name='body')
        body.insert(0, story)
        return soup

    def postprocess_html(self, soup, first):
        for p in soup.findAll(id='articlePaginationWrapper'): p.extract()
        if not first:
            for div in soup.findAll(id='article-head'): div.extract()
        return soup
    #def postprocess_html(self, soup, first):
        #for p in soup.findAll(id='articlePaginationWrapper'): p.extract()
        #if not first:
            #for div in soup.findAll(id='article-head'): div.extract()
        #return soup

@@ -1,5 +1,5 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
#from calibre.ebooks.BeautifulSoup import BeautifulSoup
from urllib import quote

class SportsIllustratedRecipe(BasicNewsRecipe) :
@@ -91,7 +91,7 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
        # expire : no idea what value to use
        # All this comes from the Javascript function that redirects to the print version. It's called PT() and is defined in the file 48.js

    def preprocess_html(self, soup):
    '''def preprocess_html(self, soup):
        header = soup.find('div', attrs = {'class' : 'siv_artheader'})
        homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
        body = homeMadeSoup.body
@@ -115,4 +115,5 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
            body.append(para)

        return homeMadeSoup
    '''

 115  resources/recipes/sunday_times.recipe  Normal file
@@ -0,0 +1,115 @@

__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.thesundaytimes.co.uk
'''
import urllib
from calibre.web.feeds.news import BasicNewsRecipe

class TimesOnline(BasicNewsRecipe):
    title = 'The Sunday Times UK'
    __author__ = 'Darko Miletic'
    description = 'news from United Kingdom and World'
    language = 'en_GB'
    publisher = 'Times Newspapers Ltd'
    category = 'news, politics, UK'
    oldest_article = 3
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    delay = 1
    needs_subscription = True
    publication_type = 'newspaper'
    masthead_url = 'http://www.thesundaytimes.co.uk/sto/public/images/logos/logo-home.gif'
    INDEX = 'http://www.thesundaytimes.co.uk'
    PREFIX = u'http://www.thesundaytimes.co.uk/sto/'
    extra_css = """
        .author-name,.authorName{font-style: italic}
        .published-date,.multi-position-photo-text{font-family: Arial,Helvetica,sans-serif;
            font-size: small; color: gray;
            display:block; margin-bottom: 0.5em}
        body{font-family: Georgia,"Times New Roman",Times,serif}
    """

    conversion_options = {
        'comment'     : description
        , 'tags'      : category
        , 'publisher' : publisher
        , 'language'  : language
    }


    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        br.open('http://www.timesplus.co.uk/tto/news/?login=false&url=http://www.thesundaytimes.co.uk/sto/')
        if self.username is not None and self.password is not None:
            data = urllib.urlencode({ 'userName':self.username
                ,'password':self.password
                ,'keepMeLoggedIn':'false'
            })
            br.open('https://www.timesplus.co.uk/iam/app/authenticate',data)
        return br

    remove_tags = [
        dict(name=['object','link','iframe','base','meta'])
        ,dict(attrs={'class':'tools comments-parent' })
    ]
    remove_attributes=['lang']
    keep_only_tags = [
        dict(attrs={'class':'standard-content'})
        ,dict(attrs={'class':'f-author'})
        ,dict(attrs={'id':'bodycopy'})
    ]
    remove_tags_after=dict(attrs={'class':'tools_border'})

    feeds = [
        (u'UK News'     , PREFIX + u'news/uk_news/'        )
        ,(u'World'      , PREFIX + u'news/world_news/'     )
        ,(u'Politics'   , PREFIX + u'news/Politics/'       )
        ,(u'Focus'      , PREFIX + u'news/focus/'          )
        ,(u'Insight'    , PREFIX + u'news/insight/'        )
        ,(u'Ireland'    , PREFIX + u'news/ireland/'        )
        ,(u'Columns'    , PREFIX + u'comment/columns/'     )
        ,(u'Arts'       , PREFIX + u'culture/arts/'        )
        ,(u'Books'      , PREFIX + u'culture/books/'       )
        ,(u'Film and TV', PREFIX + u'culture/film_and_tv/' )
        ,(u'Sport'      , PREFIX + u'sport/'               )
        ,(u'Business'   , PREFIX + u'business'             )
        ,(u'Money'      , PREFIX + u'business/money/'      )
        ,(u'Style'      , PREFIX + u'style/'               )
        ,(u'Travel'     , PREFIX + u'travel/'              )
        ,(u'Clarkson'   , PREFIX + u'ingear/clarkson/'     )
        ,(u'Cars'       , PREFIX + u'ingear/cars/'         )
        ,(u'Bikes'      , PREFIX + u'ingear/2_Wheels/'     )
        ,(u'Tech'       , PREFIX + u'ingear/Tech___Games/' )
        ,(u'Magazine'   , PREFIX + u'Magazine/'            )
    ]

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return self.adeify_images(soup)

    def parse_index(self):
        totalfeeds = []
        lfeeds = self.get_feeds()
        for feedobj in lfeeds:
            feedtitle, feedurl = feedobj
            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
            articles = []
            soup = self.index_to_soup(feedurl)
            for atag in soup.findAll('a',href=True):
                parentName = atag.parent.name
                title = self.tag_to_string(atag).strip()
                if (parentName == 'h2' or parentName == 'h3') and title is not None and title != '':
                    url = self.INDEX + atag['href']
                    articles.append({
                        'title'       :title
                        ,'date'       :''
                        ,'url'        :url
                        ,'description':''
                    })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds
@@ -35,7 +35,6 @@ class TechnologyReview(BasicNewsRecipe):
    def get_article_url(self, article):
        return article.get('guid', article.get('id', None))


    def print_version(self, url):
        baseurl='http://www.technologyreview.com/printer_friendly_article.aspx?id='
        split1 = string.split(url,"/")
@@ -43,3 +42,25 @@ class TechnologyReview(BasicNewsRecipe):
        split2= string.split(xxx,"/")
        s = baseurl + split2[0]
        return s


    def postprocess_html(self, soup, first_fetch):
        # remove picture
        headerhtml = soup.find(True, {'class':'header'})
        headerhtml.replaceWith("")

        # remove close button
        closehtml = soup.find(True, {'class':'close'})
        closehtml.replaceWith("")

        # remove banner advertisement
        bannerhtml = soup.find(True, {'class':'bannerad'})
        bannerhtml.replaceWith("")

        # thanks kiklop74! This code removes all links from the text
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)

        return soup

 25  resources/recipes/tri_city_herald.recipe  Normal file
@@ -0,0 +1,25 @@
from calibre.web.feeds.news import BasicNewsRecipe

class TriCityHeraldRecipe(BasicNewsRecipe):
    title = u'Tri-City Herald'
    description = 'The Tri-City Herald Mid-Columbia.'
    language = 'en'
    __author__ = 'Laura Gjovaag'
    oldest_article = 1.5
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True
    keep_only_tags = [
        dict(name='div', attrs={'id':'story_header'}),
        dict(name='img', attrs={'class':'imageCycle'}),
        dict(name='div', attrs={'id':['cycleImageCaption', 'story_body']})
    ]
    remove_tags = [
        dict(name='div', attrs={'id':'story_mlt'}),
        dict(name='a', attrs={'id':'commentCount'}),
        dict(name=['script', 'noscript', 'style'])]
    extra_css = 'h1{font: bold 140%;} #cycleImageCaption{font: monospace 60%}'

    feeds = [
        (u'Tri-City Herald Mid-Columbia', u'http://www.tri-cityherald.com/901/index.rss')
    ]
 80  resources/recipes/tyzden.recipe  Normal file
@@ -0,0 +1,80 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2011, Miroslav Vasko zemiak@gmail.com'

'''
.tyzden, a weekly news magazine (a week old issue)
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from datetime import date
import re

class TyzdenRecipe(BasicNewsRecipe):
    __license__ = 'GPL v3'
    __author__ = 'zemiak'
    language = 'sk'
    version = 1

    publisher = u'www.tyzden.sk'
    category = u'Magazine'
    description = u'A conservative weekly magazine. The latest free issue'

    today = date.today()
    iso = today.isocalendar()
    year = iso[0]
    weeknum = iso[1]

    if (weeknum > 1):
        weeknum -= 1

    title = u'tyzden'

    base_url_path = 'http://www.tyzden.sk/casopis/' + str(year) + '/' + str(weeknum)
    base_url = base_url_path + '.html'

    oldest_article = 20
    max_articles_per_feed = 100
    remove_javascript = True

    use_embedded_content = False
    no_stylesheets = True

    keep_only_tags = []
    keep_only_tags.append(dict(name = 'h1'))
    keep_only_tags.append(dict(name = 'div', attrs = {'class': 'text_area top_nofoto'}))
    keep_only_tags.append(dict(name = 'div', attrs = {'class': 'text_block'}))

    remove_tags_after = [dict(name = 'div', attrs = {'class': 'text_block'})]

    def find_sections(self):
        soup = self.index_to_soup(self.base_url)
        # find cover pic
        imgdiv = soup.find('div', attrs = {'class': 'foto'})
        if imgdiv is not None:
            img = imgdiv.find('img')
            if img is not None:
                self.cover_url = 'http://www.tyzden.sk/' + img['src']
        # end find cover pic

        for s in soup.findAll('a', attrs={'href': re.compile(r'rubrika/.*')}):
            yield (self.tag_to_string(s), s)

    def find_articles(self, soup):
        for art in soup.findAllNext('a'):
            if (not art['href'].startswith('casopis/')):
                break

            url = art['href']
            title = self.tag_to_string(art)
            yield {
                'title': title, 'url':self.base_url_path + '/' + url, 'description':title,
                'date' : strftime('%a, %d %b'),
            }

    def parse_index(self):
        feeds = []
        for title, soup in self.find_sections():
            feeds.append((title, list(self.find_articles(soup))))

        return feeds
@@ -1,4 +1,3 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement

@@ -29,13 +28,16 @@ class AdvancedUserRecipe1249039563(BasicNewsRecipe):
    language = 'nl'

    extra_css = '''
        body{font-family:Arial,Helvetica,sans-serif; font-size:small;}
        body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        h1{font-size:large;}
    '''
    '''
    Change Log:
    Date: 10/10/10 - Modified code to use an obfuscated-article handler to fetch the print version
    Author: Tony Stegall

    Date: 01/01/11 - Modified for better results around December/January.
    Author: Martin Tarenskeen
    '''
    #######################################################################################################
    temp_files = []
@@ -48,11 +50,17 @@ class AdvancedUserRecipe1249039563(BasicNewsRecipe):
        year = date.today().year

        try:
            response = br.follow_link(url_regex='.*?(%d)(\\/)(article)(\\/)(print)(\\/)'%year, nr = 0)
            html = response.read()
            response = br.follow_link(url_regex='.*?(%d)(\\/)(article)(\\/)(print)(\\/)'%year, nr = 0)
            html = response.read()
        except:
            response = br.open(url)
            html = response.read()
            year = year-1
            try:
                response = br.follow_link(url_regex='.*?(%d)(\\/)(article)(\\/)(print)(\\/)'%year, nr = 0)
                html = response.read()
            except:
                response = br.open(url)
                html = response.read()


        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
        self.temp_files[-1].write(html)
@@ -76,10 +84,3 @@ class AdvancedUserRecipe1249039563(BasicNewsRecipe):
        (u'Cultuur', u'http://www.volkskrant.nl/rss/kunst.rss'),
        (u'Gezondheid & Wetenschap', u'http://www.volkskrant.nl/rss/wetenschap.rss'),
        (u'Internet & Media', u'http://www.volkskrant.nl/rss/media.rss') ]

    '''
    example for formatting
    '''
    # original url: http://www.volkskrant.nl/vk/nl/2668/Buitenland/article/detail/1031493/2010/10/10/Noord-Korea-ziet-nieuwe-leider.dhtml
    # print url   : http://www.volkskrant.nl/vk/nl/2668/2010/article/print/detail/1031493/Noord-Korea-ziet-nieuwe-leider.dhtml

 44  resources/recipes/walla.recipe  Normal file
@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-

from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1283848012(BasicNewsRecipe):
    description = 'The WallaNews.'
    cover_url = 'http://ftp5.bizportal.co.il/web/giflib/news/rsPhoto/sz_5/rsz_220_220_logo_walla.gif'
    title = u'Walla'
    language = 'he'
    __author__ = 'marbs'
    extra_css = 'img {max-width:100%;} body{direction: rtl;},title{direction: rtl; } ,article_description{direction: rtl; }, a.article{direction: rtl; } ,calibre_feed_description{direction: rtl; }'
    simultaneous_downloads = 5
    # remove_javascript = True
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 1
    max_articles_per_feed = 100
    # remove_attributes = ['width']
    keep_only_tags = dict(name='div', attrs={'class':'wp-0-b w3'})
    remove_tags = [dict(name='div', attrs={'class':'tagsContainer'})]
    max_articles_per_feed = 100
    # preprocess_regexps = [
    #     (re.compile(r'<p> </p>', re.DOTALL|re.IGNORECASE), lambda match: '')
    # ]


    feeds = [(u'חדשות', u'http://rss.walla.co.il/?w=/1/0/1/@rss'),
             (u'עסקים', u'http://rss.walla.co.il/?w=/2/3/1/@rss'),
             (u'תרבות', u'http://rss.walla.co.il/?w=/4/249/1/@rss'),
             (u'בריאות', u'http://rss.walla.co.il/?w=/5/18/1/@rss'),
             (u'TECH', u'http://rss.walla.co.il/?w=/6/4/1/@rss'),
             (u'אסטרולוגיה', u'http://rss.walla.co.il/?w=/8/3307/1/@rss'),
             (u'בעלי חיים', u'http://rss.walla.co.il/?w=/59/5703/1/@rss'),
             (u'רכב', u'http://rss.walla.co.il/?w=/31/4700/1/@rss'),
             (u'סלבס', u'http://rss.walla.co.il/?w=/22/3600/1/@rss'),
             (u'אוכל', u'http://rss.walla.co.il/?w=/9/903/1/@rss'),
             (u'אופנה', u'http://rss.walla.co.il/?w=/24/2120/1/@rss'),
             (u'ברנזה', u'http://rss.walla.co.il/?w=/27/3900/1/@rss'),
             (u'ZONE', u'http://rss.walla.co.il/?w=/18/500/1/@rss'),
             (u'ספורט', u'http://rss.walla.co.il/?w=/3/7/1/@rss')]

    def print_version(self, url):
        print_url = url + '/@@/item/printer'
        return print_url

 29  resources/recipes/wichita_eagle.recipe  Normal file
@@ -0,0 +1,29 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1294938721(BasicNewsRecipe):
    title = u'Wichita Eagle'
    language = 'en'
    __author__ = 'Jason Cameron'
    description = 'Daily news from the Wichita Eagle'
    oldest_article = 1
    max_articles_per_feed = 30
    keep_only_tags = [dict(name='div', attrs={'id':'wide'})]
    feeds = [
        (u'Local News',
         u'http://www.kansas.com/news/local/index.rss'),
        (u'National News',
         u'http://www.kansas.com/news/nation-world/index.rss'),
        (u'Sports',
         u'http://www.kansas.com/sports/index.rss'),
        (u'Opinion',
         u'http://www.kansas.com/opinion/index.rss'),
        (u'Life',
         u'http://www.kansas.com/living/index.rss'),
        (u'Entertainment',
         u'http://www.kansas.com/entertainment/index.rss')
    ]

    def print_version(self, url):
        # insert '/v-print' after the section segment of the URL
        urlparts = url.split('/')
        newadd = urlparts[5] + '/v-print'
        return newadd.join(url.split(urlparts[5]))
@@ -2,8 +2,10 @@
__license__ = 'GPL v3'
__docformat__ = 'restructuredtext en'

import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.chardet import xml_to_unicode

class Wired_Daily(BasicNewsRecipe):

@@ -15,30 +17,43 @@ class Wired_Daily(BasicNewsRecipe):

    no_stylesheets = True

    preprocess_regexps = [(re.compile(r'<head.*</head>', re.DOTALL), lambda m:
        '<head></head>')]

    remove_tags_before = dict(name='div', id='content')
    remove_tags = [dict(id=['social_tools', 'outerWrapper', 'sidebar',
        'footer', 'advertisement', 'blog_subscription_unit',
        'brightcove_component']),
        {'class':'entryActions'},
        dict(name=['noscript', 'script'])]
    remove_tags = [dict(id=['header', 'commenting_module', 'post_nav',
        'social_tools', 'sidebar', 'footer', 'social_wishlist', 'pgwidget',
        'outerWrapper', 'inf_widget']),
        {'class':['entryActions', 'advertisement', 'entryTags']},
        dict(name=['noscript', 'script']),
        dict(name='h4', attrs={'class':re.compile(r'rat\d+')}),
        {'class':lambda x: x and x.startswith('contentjump')},
        dict(name='li', attrs={'class':['entryCategories', 'entryEdit']})]


    feeds = [
        ('Top News', 'http://feeds.wired.com/wired/index'),
        ('Culture', 'http://feeds.wired.com/wired/culture'),
        ('Software', 'http://feeds.wired.com/wired/software'),
        ('Mac', 'http://feeds.feedburner.com/cultofmac/bFow'),
        ('Gadgets', 'http://feeds.wired.com/wired/gadgets'),
        ('Cars', 'http://feeds.wired.com/wired/cars'),
        ('Entertainment', 'http://feeds.wired.com/wired/entertainment'),
        ('Gaming', 'http://feeds.wired.com/wired/gaming'),
        ('Science', 'http://feeds.wired.com/wired/science'),
        ('Med Tech', 'http://feeds.wired.com/wired/medtech'),
        ('Politics', 'http://feeds.wired.com/wired/politics'),
        ('Tech Biz', 'http://feeds.wired.com/wired/techbiz'),
        ('Commentary', 'http://feeds.wired.com/wired/commentary'),
        ('Product Reviews',
         'http://www.wired.com/reviews/feeds/latestProductsRss'),
        ('Autopia', 'http://www.wired.com/autopia/feed/'),
        ('Danger Room', 'http://www.wired.com/dangerroom/feed/'),
        ('Epicenter', 'http://www.wired.com/epicenter/feed/'),
        ('Gadget Lab', 'http://www.wired.com/gadgetlab/feed/'),
        ('Geek Dad', 'http://www.wired.com/geekdad/feed/'),
        ('Playbook', 'http://www.wired.com/playbook/feed/'),
        ('Rawfile', 'http://www.wired.com/rawfile/feed/'),
        ('This Day in Tech', 'http://www.wired.com/thisdayintech/feed/'),
        ('Threat Level', 'http://www.wired.com/threatlevel/feed/'),
        ('Underwire', 'http://www.wired.com/underwire/feed/'),
        ('Web Monkey', 'http://www.webmonkey.com/feed/'),
        ('Science', 'http://www.wired.com/wiredscience/feed/'),
    ]

    def populate_article_metadata(self, article, soup, first):
        if article.text_summary:
            article.text_summary = xml_to_unicode(article.text_summary,
                    resolve_entities=True)[0]

    def print_version(self, url):
        return url.replace('http://www.wired.com/', 'http://www.wired.com/print/')
        return url + '/all/1'

 21  resources/recipes/yakima_herald.recipe  Normal file
@@ -0,0 +1,21 @@
from calibre.web.feeds.news import BasicNewsRecipe

class YakimaHeraldRepublicRecipe(BasicNewsRecipe):
    title = u'Yakima Herald-Republic'
    description = 'The Yakima Herald-Republic.'
    language = 'en'
    __author__ = 'Laura Gjovaag'
    oldest_article = 1.5
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True
    keep_only_tags = [
        dict(name='div', attrs={'id':['searchleft', 'headline_credit']}),
        dict(name='div', attrs={'class':['photo', 'cauthor', 'photocredit']}),
        dict(name='div', attrs={'id':['content_body', 'footerleft']})
    ]
    extra_css = '.cauthor {font: monospace 60%;} .photocredit {font: monospace 60%}'

    feeds = [
        (u'Yakima Herald Online', u'http://feeds.feedburner.com/yhronlinenews'),
    ]
 33  resources/recipes/zerohedge.recipe  Normal file
@@ -0,0 +1,33 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
www.zerohedge.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe

class ZeroHedge(BasicNewsRecipe):
    title = 'Zero Hedge'
    __author__ = 'Darko Miletic'
    description = 'On a long enough timeline the survival rate for everyone drops to zero'
    oldest_article = 10
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = True
    encoding = 'utf8'
    publisher = 'zero hedge'
    category = 'news, USA, world, economy, politics'
    language = 'en'
    masthead_url = 'http://www.zerohedge.com/themes/newsflash/logo.png'
    publication_type = 'blog'
    extra_css = 'body{ font-family: sans-serif }'

    conversion_options = {
        'comments' : description
        ,'tags' : category
        ,'language' : language
        ,'publisher': publisher
    }


    feeds = [(u'Articles', u'http://feeds.feedburner.com/zerohedge/feed')]
 28  resources/template-functions.json  Normal file
@@ -0,0 +1,28 @@
{
    "contains": "def evaluate(self, formatter, kwargs, mi, locals,\n val, test, value_if_present, value_if_not):\n if re.search(test, val):\n return value_if_present\n else:\n return value_if_not\n",
    "divide": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x / y)\n",
    "uppercase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return val.upper()\n",
    "strcat": "def evaluate(self, formatter, kwargs, mi, locals, *args):\n i = 0\n res = ''\n for i in range(0, len(args)):\n res += args[i]\n return res\n",
    "substr": "def evaluate(self, formatter, kwargs, mi, locals, str_, start_, end_):\n return str_[int(start_): len(str_) if int(end_) == 0 else int(end_)]\n",
    "ifempty": "def evaluate(self, formatter, kwargs, mi, locals, val, value_if_empty):\n if val:\n return val\n else:\n return value_if_empty\n",
    "field": "def evaluate(self, formatter, kwargs, mi, locals, name):\n return formatter.get_value(name, [], kwargs)\n",
    "capitalize": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return capitalize(val)\n",
    "list_item": "def evaluate(self, formatter, kwargs, mi, locals, val, index, sep):\n if not val:\n return ''\n index = int(index)\n val = val.split(sep)\n try:\n return val[index]\n except:\n return ''\n",
    "shorten": "def evaluate(self, formatter, kwargs, mi, locals,\n val, leading, center_string, trailing):\n l = max(0, int(leading))\n t = max(0, int(trailing))\n if len(val) > l + len(center_string) + t:\n return val[0:l] + center_string + ('' if t == 0 else val[-t:])\n else:\n return val\n",
    "re": "def evaluate(self, formatter, kwargs, mi, locals, val, pattern, replacement):\n return re.sub(pattern, replacement, val)\n",
    "add": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x + y)\n",
    "lookup": "def evaluate(self, formatter, kwargs, mi, locals, val, *args):\n if len(args) == 2: # here for backwards compatibility\n if val:\n return formatter.vformat('{'+args[0].strip()+'}', [], kwargs)\n else:\n return formatter.vformat('{'+args[1].strip()+'}', [], kwargs)\n if (len(args) % 2) != 1:\n raise ValueError(_('lookup requires either 2 or an odd number of arguments'))\n i = 0\n while i < len(args):\n if i + 1 >= len(args):\n return formatter.vformat('{' + args[i].strip() + '}', [], kwargs)\n if re.search(args[i], val):\n return formatter.vformat('{'+args[i+1].strip() + '}', [], kwargs)\n i += 2\n",
    "template": "def evaluate(self, formatter, kwargs, mi, locals, template):\n template = template.replace('[[', '{').replace(']]', '}')\n return formatter.safe_format(template, kwargs, 'TEMPLATE', mi)\n",
    "print": "def evaluate(self, formatter, kwargs, mi, locals, *args):\n print args\n return None\n",
    "titlecase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return titlecase(val)\n",
    "test": "def evaluate(self, formatter, kwargs, mi, locals, val, value_if_set, value_not_set):\n if val:\n return value_if_set\n else:\n return value_not_set\n",
    "eval": "def evaluate(self, formatter, kwargs, mi, locals, template):\n from formatter import eval_formatter\n template = template.replace('[[', '{').replace(']]', '}')\n return eval_formatter.safe_format(template, locals, 'EVAL', None)\n",
    "multiply": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x * y)\n",
    "subtract": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x - y)\n",
    "count": "def evaluate(self, formatter, kwargs, mi, locals, val, sep):\n return unicode(len(val.split(sep)))\n",
    "lowercase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return val.lower()\n",
    "assign": "def evaluate(self, formatter, kwargs, mi, locals, target, value):\n locals[target] = value\n return value\n",
    "switch": "def evaluate(self, formatter, kwargs, mi, locals, val, *args):\n if (len(args) % 2) != 1:\n raise ValueError(_('switch requires an odd number of arguments'))\n i = 0\n while i < len(args):\n if i + 1 >= len(args):\n return args[i]\n if re.search(args[i], val):\n return args[i+1]\n i += 2\n",
    "strcmp": "def evaluate(self, formatter, kwargs, mi, locals, x, y, lt, eq, gt):\n v = strcmp(x, y)\n if v < 0:\n return lt\n if v == 0:\n return eq\n return gt\n",
    "cmp": "def evaluate(self, formatter, kwargs, mi, locals, x, y, lt, eq, gt):\n x = float(x if x else 0)\n y = float(y if y else 0)\n if x < y:\n return lt\n if x == y:\n return eq\n return gt\n"
}
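Each entry in the file above is the captured source of a built-in template function's evaluate() method. As a rough illustration of the shape a user-defined function in the new Preferences->Advanced->Template functions dialog would take (the method body below is invented, not part of this commit):

    # Hypothetical custom template function body, following the same
    # evaluate() signature the JSON entries above record.
    def evaluate(self, formatter, kwargs, mi, locals, val, prefix):
        # prepend a fixed prefix unless the value is empty
        if not val:
            return ''
        return prefix + val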
@@ -287,7 +287,7 @@
            <xsl:value-of select="count(preceding::rtf:footnote) + 1"/>
            <xsl:text>]</xsl:text>
        </xsl:when>
        <xsl:when test="(@superscript = 'true')">
        <xsl:when test="(@superscript)">
            <xsl:element name="sup">
                <xsl:element name="span">
                    <xsl:attribute name="class">
@@ -297,7 +297,7 @@
                </xsl:element>
            </xsl:element>
        </xsl:when>
        <xsl:when test="(@underscript = 'true')">
        <xsl:when test="(@underscript or @subscript)">
            <xsl:element name="sub">
                <xsl:element name="span">
                    <xsl:attribute name="class">

@@ -41,6 +41,7 @@ function scroll_to_bookmark(bookmark) {
    $.scrollTo($(bm[0]), 1000,
        {
            over:ratio,
            axis: 'y', // Do not scroll in the x direction
            onAfter:function(){window.py_bridge.animated_scroll_done()}
        }
    );

@@ -117,11 +117,10 @@ if iswindows:
    poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
            r'%s\poppler;%s'%(sw_inc_dir, sw_inc_dir))

    popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[1]+r'\qt4']
    poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', sw_lib_dir)
    popplerqt4_lib_dirs = poppler_lib_dirs
    poppler_libs = ['poppler']
    magick_inc_dirs = [os.path.join(prefix, 'build', 'ImageMagick-6.5.6')]
    magick_inc_dirs = [os.path.join(prefix, 'build', 'ImageMagick-6.6.6')]
    magick_lib_dirs = [os.path.join(magick_inc_dirs[0], 'VisualMagick', 'lib')]
    magick_libs = ['CORE_RL_wand_', 'CORE_RL_magick_']
    podofo_inc = os.path.join(sw_inc_dir, 'podofo')
@@ -131,7 +130,6 @@ elif isosx:
    fc_lib = '/sw/lib'
    poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
            '/sw/build/poppler-0.14.5/poppler:/sw/build/poppler-0.14.5')
    popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
    poppler_lib_dirs = consolidate('POPPLER_LIB_DIR',
            '/sw/lib')
    poppler_libs = ['poppler']
@@ -150,9 +148,6 @@ else:
    # Include directories
    poppler_inc_dirs = pkgconfig_include_dirs('poppler',
            'POPPLER_INC_DIR', '/usr/include/poppler')
    popplerqt4_inc_dirs = pkgconfig_include_dirs('poppler-qt4', '', '')
    if not popplerqt4_inc_dirs:
        popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
    png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR',
            '/usr/include')
    magick_inc_dirs = pkgconfig_include_dirs('MagickWand', 'MAGICK_INC', '/usr/include/ImageMagick')
@@ -187,20 +182,17 @@ if not poppler_inc_dirs or not os.path.exists(
    poppler_error = \
        ('Poppler not found on your system. Various PDF related',
        ' functionality will not work. Use the POPPLER_INC_DIR and',
        ' POPPLER_LIB_DIR environment variables.')

popplerqt4_error = None
if not popplerqt4_inc_dirs or not os.path.exists(
        os.path.join(popplerqt4_inc_dirs[-1], 'poppler-qt4.h')):
    popplerqt4_error = \
        ('Poppler Qt4 bindings not found on your system.')
        ' POPPLER_LIB_DIR environment variables. calibre requires '
        ' the poppler XPDF headers. If your distro does not '
        ' include them you will have to re-compile poppler '
        ' by hand with --enable-xpdf-headers')

magick_error = None
if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0],
    'wand')):
    magick_error = ('ImageMagick not found on your system. '
            'Try setting the environment variables MAGICK_INC '
            'and MAGICK_LIB to help calibre locate the inclue and libbrary '
            'and MAGICK_LIB to help calibre locate the include and library '
            'files.')

podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib)

@@ -612,8 +612,13 @@ class Py2App(object):
        dmg = os.path.join(destdir, volname+'.dmg')
        if os.path.exists(dmg):
            os.unlink(dmg)
        subprocess.check_call(['/usr/bin/hdiutil', 'create', '-srcfolder', os.path.abspath(d),
        tdir = tempfile.mkdtemp()
        shutil.copytree(d, os.path.join(tdir, os.path.basename(d)),
                symlinks=True)
        os.symlink('/Applications', os.path.join(tdir, 'Applications'))
        subprocess.check_call(['/usr/bin/hdiutil', 'create', '-srcfolder', tdir,
            '-volname', volname, '-format', format, dmg])
        shutil.rmtree(tdir)
        if internet_enable:
            subprocess.check_call(['/usr/bin/hdiutil', 'internet-enable', '-yes', dmg])
        size = os.stat(dmg).st_size/(1024*1024.)

@@ -18,7 +18,7 @@ QT_DLLS = ['Core', 'Gui', 'Network', 'Svg', 'WebKit', 'Xml', 'XmlPatterns']
LIBUSB_DIR = 'C:\\libusb'
LIBUNRAR = 'C:\\Program Files\\UnrarDLL\\unrar.dll'
SW = r'C:\cygwin\home\kovid\sw'
IMAGEMAGICK = os.path.join(SW, 'build', 'ImageMagick-6.5.6',
IMAGEMAGICK = os.path.join(SW, 'build', 'ImageMagick-6.6.6',
        'VisualMagick', 'bin')

VERSION = re.sub('[a-z]\d+', '', __version__)

|
||||
|
||||
Run configure.bat in a visual studio command prompt
|
||||
|
||||
Run configure.exe generated by configure.bat
|
||||
|
||||
Edit magick/magick-config.h
|
||||
|
||||
Undefine ProvideDllMain and MAGICKCORE_X11_DELEGATE
|
||||
|
||||
Now open VisualMagick/VisualDynamicMT.sln set to Release
|
||||
Remove the CORE_xlib project
|
||||
Remove the CORE_xlib and UTIL_Imdisplay project CORE_Magick++
|
||||
|
||||
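The manual steps above can be partly scripted. A rough sketch under stated assumptions (msbuild on PATH, the ImageMagick 6.6.6 tree at the path used elsewhere in this commit; the IDE/project edits and the magick-config.h edit still have to be done by hand):

    import subprocess

    # Hypothetical automation of the build notes above; paths and tool
    # names are assumptions, not part of this commit.
    MAGICK = r'C:\cygwin\home\kovid\sw\build\ImageMagick-6.6.6'

    # build and run the configure wizard (its output still needs review)
    subprocess.check_call(['cmd', '/c', 'configure.bat'],
                          cwd=MAGICK + r'\VisualMagick\configure')
    # ... edit magick/magick-config.h by hand at this point ...
    subprocess.check_call(['msbuild', 'VisualDynamicMT.sln',
                           '/p:Configuration=Release'],
                          cwd=MAGICK + r'\VisualMagick')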
calibre
---------

@@ -84,6 +84,23 @@ class Resources(Command):

        cPickle.dump(complete, open(dest, 'wb'), -1)

        self.info('\tCreating template-functions.json')
        dest = self.j(self.RESOURCES, 'template-functions.json')
        function_dict = {}
        import inspect
        from calibre.utils.formatter_functions import all_builtin_functions
        for obj in all_builtin_functions:
            eval_func = inspect.getmembers(obj,
                    lambda x: inspect.ismethod(x) and x.__name__ == 'evaluate')
            try:
                lines = [l[4:] for l in inspect.getsourcelines(eval_func[0][1])[0]]
            except:
                continue
            lines = ''.join(lines)
            function_dict[obj.name] = lines
        import json
        json.dump(function_dict, open(dest, 'wb'), indent=4)

    def clean(self):
        for x in ('scripts', 'recipes', 'ebook-convert-complete'):
            x = self.j(self.RESOURCES, x+'.pickle')
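The hunk above dumps each built-in function's dedented evaluate() source into template-functions.json. The dump can be turned back into callables; a hypothetical consumer sketch (nothing in this commit does this, and the file path is assumed):

    import json

    funcs = json.load(open('resources/template-functions.json', 'rb'))
    namespace = {}
    exec funcs['uppercase'] in namespace   # defines evaluate() at top level
    # signature: evaluate(self, formatter, kwargs, mi, locals, val)
    print namespace['evaluate'](None, None, None, None, None, 'calibre')  # -> CALIBRE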
@@ -6,7 +6,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import os, re, cStringIO, base64, httplib, subprocess, hashlib, shutil
import os, re, cStringIO, base64, httplib, subprocess, hashlib, shutil, time
from subprocess import check_call
from tempfile import NamedTemporaryFile, mkdtemp

@@ -160,7 +160,7 @@ class UploadToGoogleCode(Command):

        return 'multipart/form-data; boundary=%s' % BOUNDARY, CRLF.join(body)

    def upload(self, fname, desc, labels=[]):
    def upload(self, fname, desc, labels=[], retry=0):
        form_fields = [('summary', desc)]
        form_fields.extend([('label', l.strip()) for l in labels])

@@ -183,6 +183,10 @@ class UploadToGoogleCode(Command):

        print 'Failed to upload with code %d and reason: %s'%(resp.status,
                resp.reason)
        if retry < 1:
            print 'Retrying in 5 seconds....'
            time.sleep(5)
            return self.upload(fname, desc, labels=labels, retry=retry+1)
        raise Exception('Failed to upload '+fname)

|
||||
opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
|
||||
opener.set_handle_robots(False)
|
||||
opener.addheaders = [('User-agent', ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
|
||||
'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
|
||||
'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13')]
|
||||
http_proxy = get_proxies().get('http', None)
|
||||
if http_proxy:
|
||||
opener.set_proxies({'http':http_proxy})
|
||||
@@ -459,6 +459,18 @@ def force_unicode(obj, enc=preferred_encoding):
        obj = obj.decode('utf-8')
    return obj

def as_unicode(obj, enc=preferred_encoding):
    if not isbytestring(obj):
        try:
            obj = unicode(obj)
        except:
            try:
                obj = str(obj)
            except:
                obj = repr(obj)
    return force_unicode(obj, enc=enc)

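as_unicode coerces any object to unicode without ever raising, falling back through unicode(), str() and repr() before decoding. A quick illustration of the intent (assumes as_unicode is importable as defined above):

    print as_unicode(42)             # -> u'42'
    print as_unicode('plain bytes')  # byte string, decoded -> u'plain bytes'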
def human_readable(size):
    """ Convert a size in bytes into a human readable form """

@@ -2,7 +2,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
__appname__ = 'calibre'
__version__ = '0.7.35'
__version__ = '0.7.40'
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"

import re

@@ -80,6 +80,100 @@ class Plugin(object): # {{{
        '''
        pass

    def config_widget(self):
        '''
        Implement this method and :meth:`save_settings` in your plugin to
        use a custom configuration dialog, rather than relying on the simple
        string based default customization.

        This method, if implemented, must return a QWidget. The widget can have
        an optional method validate() that takes no arguments and is called
        immediately after the user clicks OK. Changes are applied if and only
        if the method returns True.
        '''
        raise NotImplementedError()

    def save_settings(self, config_widget):
        '''
        Save the settings specified by the user with config_widget.

        :param config_widget: The widget returned by :meth:`config_widget`.

        '''
        raise NotImplementedError()

    def do_user_config(self, parent=None):
        '''
        This method shows a configuration dialog for this plugin. It returns
        True if the user clicks OK, False otherwise. The changes are
        automatically applied.
        '''
        from PyQt4.Qt import QDialog, QDialogButtonBox, QVBoxLayout, \
                QLabel, Qt, QLineEdit
        from calibre.gui2 import gprefs

        prefname = 'plugin config dialog:'+self.type + ':' + self.name
        geom = gprefs.get(prefname, None)

        config_dialog = QDialog(parent)
        button_box = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
        v = QVBoxLayout(config_dialog)

        def size_dialog():
            if geom is None:
                config_dialog.resize(config_dialog.sizeHint())
            else:
                config_dialog.restoreGeometry(geom)

        button_box.accepted.connect(config_dialog.accept)
        button_box.rejected.connect(config_dialog.reject)
        config_dialog.setWindowTitle(_('Customize') + ' ' + self.name)
        try:
            config_widget = self.config_widget()
        except NotImplementedError:
            config_widget = None

        if config_widget is not None:
            v.addWidget(config_widget)
            v.addWidget(button_box)
            size_dialog()
            config_dialog.exec_()

            if config_dialog.result() == QDialog.Accepted:
                if hasattr(config_widget, 'validate'):
                    if config_widget.validate():
                        self.save_settings(config_widget)
                else:
                    self.save_settings(config_widget)
        else:
            from calibre.customize.ui import plugin_customization, \
                customize_plugin
            help_text = self.customization_help(gui=True)
            help_text = QLabel(help_text, config_dialog)
            help_text.setWordWrap(True)
            help_text.setTextInteractionFlags(Qt.LinksAccessibleByMouse
                    | Qt.LinksAccessibleByKeyboard)
            help_text.setOpenExternalLinks(True)
            v.addWidget(help_text)
            sc = plugin_customization(self)
            if not sc:
                sc = ''
            sc = sc.strip()
            sc = QLineEdit(sc, config_dialog)
            v.addWidget(sc)
            v.addWidget(button_box)
            size_dialog()
            config_dialog.exec_()

            if config_dialog.result() == QDialog.Accepted:
                sc = unicode(sc.text()).strip()
                customize_plugin(self, sc)

        geom = bytearray(config_dialog.saveGeometry())
        gprefs[prefname] = geom

        return config_dialog.result()

    def load_resources(self, names):
        '''
        If this plugin comes in a ZIP file (user added plugin), this method
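A minimal sketch of the config_widget()/save_settings() contract documented in the hunk above (the plugin name and stored attribute are invented; assumes the Plugin base class above is in scope):

    from PyQt4.Qt import QLineEdit

    class MyPlugin(Plugin):          # hypothetical example plugin
        name = 'My Plugin'

        def config_widget(self):
            # any QWidget works; one line edit holding a single setting
            return QLineEdit(getattr(self, 'stored_value', ''))

        def save_settings(self, config_widget):
            # called only after the user clicks OK (and validate(), if defined)
            self.stored_value = unicode(config_widget.text())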
@@ -307,6 +401,14 @@ class CatalogPlugin(Plugin): # {{{
    #: cli_options parsed in library.cli:catalog_option_parser()
    cli_options = []

    def _field_sorter(self, key):
        '''
        Custom fields sort after standard fields
        '''
        if key.startswith('#'):
            return '~%s' % key[1:]
        else:
            return key

    def search_sort_db(self, db, opts):

@@ -315,18 +417,18 @@ class CatalogPlugin(Plugin): # {{{
        if opts.sort_by:
            # 2nd arg = ascending
            db.sort(opts.sort_by, True)

        return db.get_data_as_dict(ids=opts.ids)

    def get_output_fields(self, opts):
    def get_output_fields(self, db, opts):
        # Return a list of requested fields, with opts.sort_by first
        all_fields = set(
        all_std_fields = set(
            ['author_sort','authors','comments','cover','formats',
             'id','isbn','ondevice','pubdate','publisher','rating',
             'series_index','series','size','tags','timestamp',
             'title','uuid'])
        all_custom_fields = set(db.custom_field_keys())
        all_fields = all_std_fields.union(all_custom_fields)

        fields = all_fields
        if opts.fields != 'all':
            # Make a list from opts.fields
            requested_fields = set(opts.fields.split(','))
@@ -337,7 +439,7 @@ class CatalogPlugin(Plugin): # {{{
        if not opts.connected_device['is_device_connected'] and 'ondevice' in fields:
            fields.pop(int(fields.index('ondevice')))

        fields.sort()
        fields = sorted(fields, key=self._field_sorter)
        if opts.sort_by and opts.sort_by in fields:
            fields.insert(0,fields.pop(int(fields.index(opts.sort_by))))
        return fields

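The '~' substitution in _field_sorter works because '~' sorts after every ASCII letter, so custom (#-prefixed) fields land at the end of the sorted list. A quick demonstration with invented field names:

    fields = ['title', '#mytag', 'authors', '#rating2']
    sorter = lambda key: '~%s' % key[1:] if key.startswith('#') else key
    print sorted(fields, key=sorter)  # -> ['authors', 'title', '#mytag', '#rating2']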
@@ -478,7 +478,7 @@ from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, \
from calibre.devices.sne.driver import SNE
from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, \
        GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, Q600, LUMIREAD, ALURATEK_COLOR, \
        TREKSTOR, EEEREADER
        TREKSTOR, EEEREADER, NEXTBOOK
from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
from calibre.devices.kobo.driver import KOBO
from calibre.devices.bambook.driver import BAMBOOK
@@ -606,6 +606,7 @@ plugins += [
        BAMBOOK,
        TREKSTOR,
        EEEREADER,
        NEXTBOOK,
        ITUNES,
]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
@@ -704,13 +705,17 @@ class ActionTweakEpub(InterfaceActionBase):
    name = 'Tweak ePub'
    actual_plugin = 'calibre.gui2.actions.tweak_epub:TweakEpubAction'

class ActionNextMatch(InterfaceActionBase):
    name = 'Next Match'
    actual_plugin = 'calibre.gui2.actions.next_match:NextMatchAction'

plugins += [ActionAdd, ActionFetchAnnotations, ActionGenerateCatalog,
        ActionConvert, ActionDelete, ActionEditMetadata, ActionView,
        ActionFetchNews, ActionSaveToDisk, ActionShowBookDetails,
        ActionRestart, ActionOpenFolder, ActionConnectShare,
        ActionSendToDevice, ActionHelp, ActionPreferences, ActionSimilarBooks,
        ActionAddToLibrary, ActionEditCollections, ActionChooseLibrary,
        ActionCopyToLibrary, ActionTweakEpub]
        ActionCopyToLibrary, ActionTweakEpub, ActionNextMatch]

# }}}

@ -842,6 +847,17 @@ class Plugboard(PreferencesPlugin):
|
||||
config_widget = 'calibre.gui2.preferences.plugboard'
|
||||
description = _('Change metadata fields before saving/sending')
|
||||
|
||||
class TemplateFunctions(PreferencesPlugin):
|
||||
name = 'TemplateFunctions'
|
||||
icon = I('template_funcs.png')
|
||||
gui_name = _('Template Functions')
|
||||
category = 'Advanced'
|
||||
gui_category = _('Advanced')
|
||||
category_order = 5
|
||||
name_order = 4
|
||||
config_widget = 'calibre.gui2.preferences.template_functions'
|
||||
description = _('Create your own template functions')
|
||||
|
||||
class Email(PreferencesPlugin):
|
||||
name = 'Email'
|
||||
icon = I('mail.png')
|
||||
@ -903,6 +919,6 @@ class Misc(PreferencesPlugin):
|
||||
|
||||
plugins += [LookAndFeel, Behavior, Columns, Toolbar, InputOptions,
|
||||
CommonOptions, OutputOptions, Adding, Saving, Sending, Plugboard,
|
||||
Email, Server, Plugins, Tweaks, Misc]
|
||||
Email, Server, Plugins, Tweaks, Misc, TemplateFunctions]
|
||||
|
||||
#}}}
|
||||
|
@ -439,6 +439,13 @@ class TabletOutput(iPadOutput):
|
||||
screen_size = (sys.maxint, sys.maxint)
|
||||
comic_screen_size = (sys.maxint, sys.maxint)
|
||||
|
||||
class SamsungGalaxy(TabletOutput):
|
||||
name = 'Samsung Galaxy'
|
||||
short_name = 'galaxy'
|
||||
description = _('Intended for the Samsung Galaxy and similar tablet devices with '
|
||||
'a resolution of 600x1280')
|
||||
screen_size = comic_screen_size = (600, 1280)
|
||||
|
||||
class SonyReaderOutput(OutputProfile):
|
||||
|
||||
name = 'Sony Reader'
|
||||
@ -617,6 +624,8 @@ class KindleDXOutput(OutputProfile):
|
||||
#comic_screen_size = (741, 1022)
|
||||
supports_mobi_indexing = True
|
||||
periodical_date_in_title = False
|
||||
missing_char = u'x\u2009'
|
||||
empty_ratings_char = u'\u2606'
|
||||
ratings_char = u'\u2605'
|
||||
read_char = u'\u2713'
|
||||
mobi_ems_per_blockquote = 2.0
|
||||
@ -707,7 +716,7 @@ class BambookOutput(OutputProfile):
|
||||
output_profiles = [OutputProfile, SonyReaderOutput, SonyReader300Output,
|
||||
SonyReader900Output, MSReaderOutput, MobipocketOutput, HanlinV3Output,
|
||||
HanlinV5Output, CybookG3Output, CybookOpusOutput, KindleOutput,
|
||||
iPadOutput, KoboReaderOutput, TabletOutput,
|
||||
iPadOutput, KoboReaderOutput, TabletOutput, SamsungGalaxy,
|
||||
SonyReaderLandscapeOutput, KindleDXOutput, IlliadOutput,
|
||||
IRexDR1000Output, IRexDR800Output, JetBook5Output, NookOutput,
|
||||
BambookOutput, NookColorOutput]
|
||||
|
@ -27,15 +27,16 @@ class ANDROID(USBMS):
|
||||
0x040d : { 0x8510 : [0x0001], 0x0851 : [0x1] },
|
||||
|
||||
# Motorola
|
||||
0x22b8 : { 0x41d9 : [0x216], 0x2d67 : [0x100], 0x41db : [0x216],
|
||||
0x4285 : [0x216], 0x42a3 : [0x216] },
|
||||
0x22b8 : { 0x41d9 : [0x216], 0x2d61 : [0x100], 0x2d67 : [0x100],
|
||||
0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216],
|
||||
0x4286 : [0x216], 0x42b3 : [0x216] },
|
||||
|
||||
# Sony Ericsson
|
||||
0xfce : { 0xd12e : [0x0100]},
|
||||
|
||||
# Google
|
||||
0x18d1 : { 0x4e11 : [0x0100, 0x226, 0x227], 0x4e12: [0x0100, 0x226,
|
||||
0x227]},
|
||||
0x227], 0x4e21: [0x0100, 0x226, 0x227]},
|
||||
|
||||
# Samsung
|
||||
0x04e8 : { 0x681d : [0x0222, 0x0223, 0x0224, 0x0400],
|
||||
@ -52,6 +53,9 @@ class ANDROID(USBMS):
|
||||
# LG
|
||||
0x1004 : { 0x61cc : [0x100] },
|
||||
|
||||
# Archos
|
||||
0x0e79 : { 0x1420 : [0x0216]},
|
||||
|
||||
}
|
||||
EBOOK_DIR_MAIN = ['eBooks/import', 'wordplayer/calibretransfer', 'Books']
|
||||
EXTRA_CUSTOMIZATION_MESSAGE = _('Comma separated list of directories to '
|
||||
@ -60,17 +64,19 @@ class ANDROID(USBMS):
|
||||
EXTRA_CUSTOMIZATION_DEFAULT = ', '.join(EBOOK_DIR_MAIN)
|
||||
|
||||
VENDOR_NAME = ['HTC', 'MOTOROLA', 'GOOGLE_', 'ANDROID', 'ACER',
|
||||
'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE']
|
||||
'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE', 'ARCHOS']
|
||||
WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE',
|
||||
'__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897',
|
||||
'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID',
|
||||
'SCH-I500_CARD', 'SPH-D700_CARD', 'MB810', 'GT-P1000', 'DESIRE']
|
||||
'SCH-I500_CARD', 'SPH-D700_CARD', 'MB810', 'GT-P1000', 'DESIRE',
|
||||
'SGH-T849', '_MB300', 'A70S']
|
||||
WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
|
||||
'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD']
|
||||
'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
|
||||
'A70S']
|
||||
|
||||
OSX_MAIN_MEM = 'HTC Android Phone Media'
|
||||
OSX_MAIN_MEM = 'Android Device Main Memory'
|
||||
|
||||
MAIN_MEMORY_VOLUME_LABEL = 'Android Phone Internal Memory'
|
||||
MAIN_MEMORY_VOLUME_LABEL = 'Android Device Main Memory'
|
||||
|
||||
SUPPORTS_SUB_DIRS = True
|
||||
|
||||
|
@ -29,12 +29,16 @@ class BAMBOOK(DeviceConfig, DevicePlugin):
|
||||
booklist_class = BookList
|
||||
book_class = Book
|
||||
|
||||
ip = None
|
||||
|
||||
FORMATS = [ "snb" ]
|
||||
VENDOR_ID = 0x230b
|
||||
PRODUCT_ID = 0x0001
|
||||
BCD = None
|
||||
CAN_SET_METADATA = False
|
||||
THUMBNAIL_HEIGHT = 155
|
||||
EXTRA_CUSTOMIZATION_MESSAGE = \
|
||||
_("Device IP Address (restart calibre after changing)")
|
||||
|
||||
icon = I("devices/bambook.png")
|
||||
# OPEN_FEEDBACK_MESSAGE = _(
|
||||
@ -47,6 +51,10 @@ class BAMBOOK(DeviceConfig, DevicePlugin):
|
||||
METADATA_FILE_GUID = 'calibremetadata.snb'
|
||||
|
||||
bambook = None
|
||||
is_connected = False
|
||||
|
||||
def __init__(self, ip):
|
||||
self.ip = ip
|
||||
|
||||
def reset(self, key='-1', log_packets=False, report_progress=None,
|
||||
detected_device=None) :
|
||||
@ -60,15 +68,23 @@ class BAMBOOK(DeviceConfig, DevicePlugin):
|
||||
self.eject()
|
||||
# Connect
|
||||
self.bambook = Bambook()
|
||||
self.bambook.Connect()
|
||||
self.bambook.Connect(ip = self.ip, timeout = 10000)
|
||||
if self.bambook.GetState() != CONN_CONNECTED:
|
||||
self.bambook = None
|
||||
raise Exception(_("Unable to connect to Bambook."))
|
||||
raise OpenFeedback(_("Unable to connect to Bambook. \n"
|
||||
"If you are trying to connect via Wi-Fi, "
|
||||
"please make sure the IP address of Bambook has been correctly configured."))
|
||||
self.is_connected = True
|
||||
return True
|
||||
|
||||
def unmount_device(self):
|
||||
self.eject()
|
||||
|
||||
def eject(self):
|
||||
if self.bambook:
|
||||
self.bambook.Disconnect()
|
||||
self.bambook = None
|
||||
self.is_connected = False
|
||||
|
||||
def post_yank_cleanup(self):
|
||||
self.eject()
|
||||
@ -475,3 +491,8 @@ class BAMBOOK(DeviceConfig, DevicePlugin):
|
||||
def get_guid(uuid):
|
||||
guid = hashlib.md5(uuid).hexdigest()[0:15] + ".snb"
|
||||
return guid
|
||||
|
||||
class BAMBOOKWifi(BAMBOOK):
|
||||
def is_usb_connected(self, devices_on_system, debug=False,
|
||||
only_presence=False):
|
||||
return self.is_connected, self
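BAMBOOKWifi shows the pattern used to bolt a network device onto the USB-centric detection loop: instead of scanning devices_on_system, the plugin simply reports the connection flag that open()/eject() maintain. A minimal sketch of the idea (illustrative, not the full plugin API):

class NetworkDevicePlugin(object):
    is_connected = False   # set True in open(), cleared in eject()

    def is_usb_connected(self, devices_on_system, debug=False,
                         only_presence=False):
        # Ignore the USB scan results entirely; the TCP session state is
        # the only thing that matters for a Wi-Fi device.
        return self.is_connected, self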

@@ -329,6 +329,8 @@ class Bambook:
self.handle = None

def Connect(self, ip = DEFAULT_BAMBOOK_IP, timeout = 10000):
if ip == None or ip == '':
ip = DEFAULT_BAMBOOK_IP
self.handle = BambookConnect(ip, timeout)
if self.handle and self.handle != 0:
return True

@@ -18,9 +18,9 @@ class FOLDER_DEVICE_FOR_CONFIG(USBMS):
supported_platforms = ['windows', 'osx', 'linux']
FORMATS = ['epub', 'fb2', 'mobi', 'azw', 'lrf', 'tcr', 'pmlz', 'lit',
'rtf', 'rb', 'pdf', 'oeb', 'txt', 'pdb', 'prc']
VENDOR_ID = 0xffff
PRODUCT_ID = 0xffff
BCD = 0xffff
VENDOR_ID = [0xffff]
PRODUCT_ID = [0xffff]
BCD = [0xffff]
DEVICE_PLUGBOARD_NAME = 'FOLDER_DEVICE'

@@ -34,9 +34,9 @@ class FOLDER_DEVICE(USBMS):
supported_platforms = ['windows', 'osx', 'linux']
FORMATS = FOLDER_DEVICE_FOR_CONFIG.FORMATS

VENDOR_ID = 0xffff
PRODUCT_ID = 0xffff
BCD = 0xffff
VENDOR_ID = [0xffff]
PRODUCT_ID = [0xffff]
BCD = [0xffff]
DEVICE_PLUGBOARD_NAME = 'FOLDER_DEVICE'

THUMBNAIL_HEIGHT = 68 # Height for thumbnails on device

@@ -20,11 +20,11 @@ class IRIVER_STORY(USBMS):
FORMATS = ['epub', 'fb2', 'pdf', 'djvu', 'txt']

VENDOR_ID = [0x1006]
PRODUCT_ID = [0x4023, 0x4025]
PRODUCT_ID = [0x4023, 0x4024, 0x4025]
BCD = [0x0323]

VENDOR_NAME = 'IRIVER'
WINDOWS_MAIN_MEM = ['STORY', 'STORY_EB05']
WINDOWS_MAIN_MEM = ['STORY', 'STORY_EB05', 'STORY_WI-FI']
WINDOWS_CARD_A_MEM = ['STORY', 'STORY_SD']

#OSX_MAIN_MEM = 'Kindle Internal Storage Media'

@@ -27,7 +27,7 @@ class Book(Book_):

self.size = size # will be set later if None

if ContentType == '6':
if ContentType == '6' and date is not None:
self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
else:
try:

@@ -33,7 +33,7 @@ class KOBO(USBMS):
booklist_class = CollectionsBookList

# Ordered list of supported formats
FORMATS = ['epub', 'pdf']
FORMATS = ['epub', 'pdf', 'txt', 'cbz', 'cbr']
CAN_SET_METADATA = ['collections']

VENDOR_ID = [0x2237]
@@ -409,7 +409,7 @@ class KOBO(USBMS):
else:
ContentType = 901
else: # if extension == '.html' or extension == '.txt':
ContentType = 999 # Yet another hack: to get around Kobo changing how ContentID is stored
ContentType = 901 # Yet another hack: to get around Kobo changing how ContentID is stored
return ContentType

def path_from_contentid(self, ContentID, ContentType, MimeType, oncard):

@@ -259,8 +259,28 @@ class EEEREADER(USBMS):
PRODUCT_ID = [0x178f]
BCD = [0x0319]

EBOOK_DIR_MAIN = 'Books'
EBOOK_DIR_MAIN = EBOOK_DIR_CARD_A = 'Book'

VENDOR_NAME = 'LINUX'
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'FILE-STOR_GADGET'

class NEXTBOOK(USBMS):

name = 'Nextbook device interface'
gui_name = 'Nextbook'
description = _('Communicate with the Nextbook Reader')
author = 'Kovid Goyal'
supported_platforms = ['windows', 'osx', 'linux']

# Ordered list of supported formats
FORMATS = ['epub', 'fb2', 'txt', 'pdf']

VENDOR_ID = [0x05e3]
PRODUCT_ID = [0x0726]
BCD = [0x021a]

EBOOK_DIR_MAIN = ''

VENDOR_NAME = 'NEXT2'
WINDOWS_MAIN_MEM = '1.0.14'

@@ -91,3 +91,19 @@ class NOOK_COLOR(NOOK):

EBOOK_DIR_MAIN = 'My Files/Books'

'''
def create_upload_path(self, path, mdata, fname, create_dirs=True):
filepath = NOOK.create_upload_path(self, path, mdata, fname,
create_dirs=create_dirs)
edm = self.EBOOK_DIR_MAIN.replace('/', os.sep)
npath = os.path.join(edm, _('News')) + os.sep
if npath in filepath:
filepath = filepath.replace(npath, os.sep.join('My Files',
'Magazines')+os.sep)
filedir = os.path.dirname(filepath)
if create_dirs and not os.path.exists(filedir):
os.makedirs(filedir)

return filepath
'''

@@ -61,14 +61,37 @@ class PRS505(USBMS):
ALL_BY_TITLE = _('All by title')
ALL_BY_AUTHOR = _('All by author')

EXTRA_CUSTOMIZATION_MESSAGE = _('Comma separated list of metadata fields '
EXTRA_CUSTOMIZATION_MESSAGE = [
_('Comma separated list of metadata fields '
'to turn into collections on the device. Possibilities include: ')+\
'series, tags, authors' +\
_('. Two special collections are available: %s:%s and %s:%s. Add '
'these values to the list to enable them. The collections will be '
'given the name provided after the ":" character.')%(
'abt', ALL_BY_TITLE, 'aba', ALL_BY_AUTHOR)
EXTRA_CUSTOMIZATION_DEFAULT = ', '.join(['series', 'tags'])
'abt', ALL_BY_TITLE, 'aba', ALL_BY_AUTHOR),
_('Upload separate cover thumbnails for books (newer readers)') +
':::'+_('Normally, the SONY readers get the cover image from the'
' ebook file itself. With this option, calibre will send a '
'separate cover image to the reader, useful if you are '
'sending DRMed books in which you cannot change the cover.'
' WARNING: This option should only be used with newer '
'SONY readers: 350, 650, 950 and newer.'),
_('Refresh separate covers when using automatic management (newer readers)') +
':::' +
_('Set this option to have separate book covers uploaded '
'every time you connect your device. Unset this option if '
'you have so many books on the reader that performance is '
'unacceptable.')
]
EXTRA_CUSTOMIZATION_DEFAULT = [
', '.join(['series', 'tags']),
False,
False
]

OPT_COLLECTIONS = 0
OPT_UPLOAD_COVERS = 1
OPT_REFRESH_COVERS = 2

plugboard = None
plugboard_func = None
@@ -159,7 +182,7 @@ class PRS505(USBMS):
opts = self.settings()
if opts.extra_customization:
collections = [x.strip() for x in
opts.extra_customization.split(',')]
opts.extra_customization[self.OPT_COLLECTIONS].split(',')]
else:
collections = []
debug_print('PRS505: collection fields:', collections)
@@ -171,6 +194,20 @@ class PRS505(USBMS):
c.update(blists, collections, pb)
c.write()

if opts.extra_customization[self.OPT_REFRESH_COVERS]:
debug_print('PRS505: uploading covers in sync_booklists')
for idx,bl in blists.items():
prefix = self._card_a_prefix if idx == 1 else \
self._card_b_prefix if idx == 2 \
else self._main_prefix
for book in bl:
p = os.path.join(prefix, book.lpath)
self._upload_cover(os.path.dirname(p),
os.path.splitext(os.path.basename(p))[0],
book, p)
else:
debug_print('PRS505: NOT uploading covers in sync_booklists')

USBMS.sync_booklists(self, booklists, end_session=end_session)
debug_print('PRS505: finished sync_booklists')

@@ -186,8 +223,15 @@ class PRS505(USBMS):
self.plugboard_func = pb_func

def upload_cover(self, path, filename, metadata, filepath):
return # Disabled as the SONY's don't need this thumbnail anyway and
# older models don't auto delete it
opts = self.settings()
if not opts.extra_customization[self.OPT_UPLOAD_COVERS]:
# Building thumbnails disabled
debug_print('PRS505: not uploading cover')
return
debug_print('PRS505: uploading cover')
self._upload_cover(path, filename, metadata, filepath)

def _upload_cover(self, path, filename, metadata, filepath):
if metadata.thumbnail and metadata.thumbnail[-1]:
path = path.replace('/', os.sep)
is_main = path.startswith(self._main_prefix)

@@ -30,6 +30,12 @@ class Drive(str):
typ.order = order
return typ

def drivecmp(a, b):
ans = cmp(getattr(a, 'order', 0), getattr(b, 'order', 0))
if ans == 0:
ans = cmp(a, b)
return ans
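drivecmp() sorts primarily on the numeric order parsed from the Windows PNP id and only falls back to comparing the drive-letter strings when the orders tie. An illustrative use (Python 2, since cmp() is involved; FakeDrive is invented for the example):

class FakeDrive(str):
    pass

e, f = FakeDrive('E:\\'), FakeDrive('F:\\')
e.order, f.order = 1, 0                  # orders recovered from the PNP ids
print(sorted([e, f], cmp=drivecmp))      # ['F:\\', 'E:\\'] - order beats letter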

class WinPNPScanner(object):

@@ -57,7 +63,13 @@ class WinPNPScanner(object):
order = 0
match = re.search(r'REV_.*?&(\d+)#', pnp_id)
if match is None:
match = re.search(r'REV_.*?&(\d+)', pnp_id)
# Windows XP
# On the Nook Color this is the last digit
#
# USBSTOR\DISK&VEN_B&N&PROD_EBOOK_DISK&REV_0100\7&13EAFDB8&0&2004760017462009&1
# USBSTOR\DISK&VEN_B&N&PROD_EBOOK_DISK&REV_0100\7&13EAFDB8&0&2004760017462009&0
#
match = re.search(r'REV_.*&(\d+)', pnp_id)
if match is not None:
order = int(match.group(1))
return order

@@ -140,11 +140,19 @@ class CollectionsBookList(BookList):
all_by_author = ''
all_by_title = ''
ca = []
all_by_something = []
for c in collection_attributes:
if c.startswith('aba:') and c[4:]:
if c.startswith('aba:') and c[4:].strip():
all_by_author = c[4:].strip()
elif c.startswith('abt:') and c[4:]:
elif c.startswith('abt:') and c[4:].strip():
all_by_title = c[4:].strip()
elif c.startswith('abs:') and c[4:].strip():
name = c[4:].strip()
sby = self.in_category_sort_rules(name)
if sby is None:
sby = name
if name and sby:
all_by_something.append((name, sby))
else:
ca.append(c.lower())
collection_attributes = ca
@@ -251,6 +259,10 @@ class CollectionsBookList(BookList):
if all_by_title not in collections:
collections[all_by_title] = {}
collections[all_by_title][lpath] = (book, tsval, asval)
for (n, sb) in all_by_something:
if n not in collections:
collections[n] = {}
collections[n][lpath] = (book, book.get(sb, ''), tsval)

# Sort collections
result = {}

@@ -11,7 +11,7 @@ intended to be subclassed with the relevant parts implemented for a particular
device. This class handles device detection.
'''

import os, subprocess, time, re, sys, glob, operator
import os, subprocess, time, re, sys, glob
from itertools import repeat

from calibre.devices.interface import DevicePlugin
@@ -225,7 +225,7 @@ class Device(DeviceConfig, DevicePlugin):
return False

def open_windows(self):
from calibre.devices.scanner import win_pnp_drives
from calibre.devices.scanner import win_pnp_drives, drivecmp

time.sleep(5)
drives = {}
@@ -263,7 +263,7 @@ class Device(DeviceConfig, DevicePlugin):
if self.WINDOWS_MAIN_MEM in (self.WINDOWS_CARD_A_MEM,
self.WINDOWS_CARD_B_MEM) or \
self.WINDOWS_CARD_A_MEM == self.WINDOWS_CARD_B_MEM:
letters = sorted(drives.values(), key=operator.attrgetter('order'))
letters = sorted(drives.values(), cmp=drivecmp)
drives = {}
for which, letter in zip(['main', 'carda', 'cardb'], letters):
drives[which] = letter

@@ -10,7 +10,21 @@ from calibre.utils.config import Config, ConfigProxy
class DeviceConfig(object):

HELP_MESSAGE = _('Configure Device')

#: Can be None, a string or a list of strings. When it is a string
#: that string is used for the help text and the actual customization value
#: can be read from ``dev.settings().extra_customization``.
#: If it is a list of strings, then dev.settings().extra_customization will
#: also be a list. In this case, you *must* ensure that
#: EXTRA_CUSTOMIZATION_DEFAULT is also a list. The list can contain either
#: boolean values or strings, in which case a checkbox or line edit will be
#: used for them in the config widget, automatically.
#: If a string contains ::: then the text after it is interpreted as the
#: tooltip
EXTRA_CUSTOMIZATION_MESSAGE = None

#: The default value for extra customization. If you set
#: EXTRA_CUSTOMIZATION_MESSAGE you *must* set this as well.
EXTRA_CUSTOMIZATION_DEFAULT = None
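Putting the docstring together: a driver that opts into the list form declares parallel message/default lists and, as PRS505 does above, index constants to keep the positions readable. A hedged sketch (MYREADER and its options are invented for illustration):

class MYREADER(USBMS):
    EXTRA_CUSTOMIZATION_MESSAGE = [
        _('Comma separated list of collection fields'),          # str -> line edit
        _('Upload covers') + ':::' + _('Shown as the tooltip'),  # bool -> checkbox
    ]
    EXTRA_CUSTOMIZATION_DEFAULT = ['series, tags', False]

    OPT_COLLECTIONS = 0
    OPT_UPLOAD_COVERS = 1

    def wants_covers(self):
        opts = self.settings()
        return opts.extra_customization[self.OPT_UPLOAD_COVERS]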

SUPPORTS_SUB_DIRS = False
@@ -73,16 +87,33 @@ class DeviceConfig(object):
if cls.SUPPORTS_USE_AUTHOR_SORT:
proxy['use_author_sort'] = config_widget.use_author_sort()
if cls.EXTRA_CUSTOMIZATION_MESSAGE:
ec = unicode(config_widget.opt_extra_customization.text()).strip()
if not ec:
ec = None
if isinstance(cls.EXTRA_CUSTOMIZATION_MESSAGE, list):
ec = []
for i in range(0, len(cls.EXTRA_CUSTOMIZATION_MESSAGE)):
if hasattr(config_widget.opt_extra_customization[i], 'isChecked'):
ec.append(config_widget.opt_extra_customization[i].isChecked())
else:
ec.append(unicode(config_widget.opt_extra_customization[i].text()).strip())
else:
ec = unicode(config_widget.opt_extra_customization.text()).strip()
if not ec:
ec = None
proxy['extra_customization'] = ec
st = unicode(config_widget.opt_save_template.text())
proxy['save_template'] = st

@classmethod
def settings(cls):
return cls._config().parse()
opts = cls._config().parse()
if isinstance(cls.EXTRA_CUSTOMIZATION_DEFAULT, list):
if opts.extra_customization is None:
opts.extra_customization = []
if not isinstance(opts.extra_customization, list):
opts.extra_customization = [opts.extra_customization]
for i,d in enumerate(cls.EXTRA_CUSTOMIZATION_DEFAULT):
if i >= len(opts.extra_customization):
opts.extra_customization.append(d)
return opts

@classmethod
def save_template(cls):

@@ -18,7 +18,7 @@

__version__ = "1.0"

import re
import re, codecs

def detect(aBuf):
import calibre.ebooks.chardet.universaldetector as universaldetector
@@ -83,9 +83,11 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
if not raw:
return u'', encoding
if not isinstance(raw, unicode):
if raw.startswith('\xff\xfe'):
if raw.startswith(codecs.BOM_UTF8):
raw, encoding = raw.decode('utf-8')[1:], 'utf-8'
elif raw.startswith(codecs.BOM_UTF16_LE):
raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
elif raw.startswith('\xfe\xff'):
elif raw.startswith(codecs.BOM_UTF16_BE):
raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be'
if not isinstance(raw, unicode):
for pat in ENCODING_PATS:
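The switch to the codecs BOM constants replaces easy-to-mistype byte literals. The idiom in isolation (a hedged sketch, not the full xml_to_unicode logic):

import codecs

def decode_bom(raw):
    for bom, enc in ((codecs.BOM_UTF8, 'utf-8'),
                     (codecs.BOM_UTF16_LE, 'utf-16-le'),
                     (codecs.BOM_UTF16_BE, 'utf-16-be')):
        if raw.startswith(bom):
            # [1:] drops the decoded BOM character (U+FEFF)
            return raw.decode(enc)[1:], enc
    return raw, None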

@@ -6,11 +6,118 @@ __docformat__ = 'restructuredtext en'

import re

class TCRCompressor(object):
'''
TCR compression takes the form header+code_dict+coded_text.
The header is always "!!8-Bit!!". The code dict is a list of 256 strings.
The list takes the form 1 byte length and then a string. Each position in
the list corresponds to a code found in the file. The coded text is a
string of character values. For instance, the character Q represents the
value 81, which corresponds to the string in the code list at position 81.
'''
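To make that layout concrete, a toy (invented) stream might look like:

# !!8-Bit!!                      <- fixed 9-byte header
# \x00 \x04the  ... \x03cat ...  <- 256 length-prefixed dictionary entries
# \x51\x52\x51                   <- coded text: each byte is an index
#
# A byte 0x51 ('Q', 81) in the coded text expands to whatever string sits
# at dictionary position 81, so decompression is one table lookup per byte.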

def _reset(self):
# List of indexes in the codes list that are empty and can hold new codes
self.unused_codes = set()
self.coded_txt = ''
# Generate initial codes from text.
# The index of the list will be the code that represents the characters at that location
# in the list
self.codes = []

def _combine_codes(self):
'''
Combine two codes that always appear as a pair into a single code.
The intent is to create more unused codes.
'''
possible_codes = []
a_code = set(re.findall('(?msu).', self.coded_txt))

for code in a_code:
single_code = set(re.findall('(?msu)%s.' % re.escape(code), self.coded_txt))
if len(single_code) == 1:
possible_codes.append(single_code.pop())

for code in possible_codes:
self.coded_txt = self.coded_txt.replace(code, code[0])
self.codes[ord(code[0])] = '%s%s' % (self.codes[ord(code[0])], self.codes[ord(code[1])])

def _free_unused_codes(self):
'''
Look for codes that do not appear in the coded text and add them to
the list of free codes.
'''
for i in xrange(256):
if i not in self.unused_codes:
if chr(i) not in self.coded_txt:
self.unused_codes.add(i)

def _new_codes(self):
'''
Create new codes from codes that occur in pairs often.
'''
possible_new_codes = list(set(re.findall('(?msu)..', self.coded_txt)))
new_codes_count = []

for c in possible_new_codes:
count = self.coded_txt.count(c)
# Less than 3 occurrences will not produce any size reduction.
if count > 2:
new_codes_count.append((c, count))

# Arrange the codes in order of least to most occurring.
possible_new_codes = [x[0] for x in sorted(new_codes_count, key=lambda c: c[1])]

return possible_new_codes

def compress(self, txt):
self._reset()

self.codes = list(set(re.findall('(?msu).', txt)))

# Replace the text with the corresponding codes
for c in txt:
self.coded_txt += chr(self.codes.index(c))

# Zero the unused codes and record which are unused.
for i in range(len(self.codes), 256):
self.codes.append('')
self.unused_codes.add(i)

self._combine_codes()
possible_codes = self._new_codes()

while possible_codes and self.unused_codes:
while possible_codes and self.unused_codes:
unused_code = self.unused_codes.pop()
# Take the last possible code and split it into individual
# codes. The last possible code is the most often occurring.
code1, code2 = possible_codes.pop()
self.codes[unused_code] = '%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)])
self.coded_txt = self.coded_txt.replace('%s%s' % (code1, code2), chr(unused_code))
self._combine_codes()
self._free_unused_codes()
possible_codes = self._new_codes()

self._free_unused_codes()

# Generate the code dictionary.
code_dict = []
for i in xrange(0, 256):
if i in self.unused_codes:
code_dict.append(chr(0))
else:
code_dict.append(chr(len(self.codes[i])) + self.codes[i])

# Join the identifier with the dictionary and coded text.
return '!!8-Bit!!'+''.join(code_dict)+self.coded_txt

def decompress(stream):
txt = []
stream.seek(0)
if stream.read(9) != '!!8-Bit!!':
raise ValueError('File %s contaions an invalid TCR header.' % stream.name)
raise ValueError('File %s contains an invalid TCR header.' % stream.name)

# Codes that the file contents are broken down into.
entries = []
@@ -26,101 +133,6 @@ def decompress(stream):

return ''.join(txt)

def compress(txt, level=5):
'''
TCR compression takes the form header+code_list+coded_text.
The header is always "!!8-Bit!!". The code list is a list of 256 strings.
The list takes the form 1 byte length and then a string. Each position in
The list corresponds to a code found in the file. The coded text is
string of characters vaules. for instance the character Q represents the
value 81 which corresponds to the string in the code list at position 81.
'''
# Turn each unique character into a coded value.
# The code of the string at a given position are represented by the position
# they occupy in the list.
codes = list(set(re.findall('(?msu).', txt)))
for i in range(len(codes), 256):
codes.append('')
# Set the compression level.
if level <= 1:
new_length = 256
if level >= 10:
new_length = 1
else:
new_length = int(256 * (10 - level) * .1)
new_length = 1 if new_length < 1 else new_length
# Replace txt with codes.
coded_txt = ''
for c in txt:
coded_txt += chr(codes.index(c))
txt = coded_txt
# Start compressing the text.
new = True
merged = True
while new or merged:
# Merge codes that always follow another code
merge = []
merged = False
for i in xrange(256):
if codes[i] != '':
# Find all codes that are next to i.
fall = list(set(re.findall('(?msu)%s.' % re.escape(chr(i)), txt)))
# 1 if only one code comes after i.
if len(fall) == 1:
# We are searching codes and each code is always 1 character.
j = ord(fall[0][1:2])
# Only merge if the total length of the string represented by
# code is less than 256.
if len(codes[i]) + len(codes[j]) < 256:
merge.append((i, j))
if merge:
merged = True
for i, j in merge:
# Merge the string for j into the string for i.
if i == j:
# Don't use += here just in case something goes wrong. This
# will prevent out of control memory consumption. This is
# unecessary but when creating this routine it happened due
# to an error.
codes[i] = codes[i] + codes[i]
else:
codes[i] = codes[i] + codes[j]
txt = txt.replace(chr(i)+chr(j), chr(i))
if chr(j) not in txt:
codes[j] = ''
new = False
if '' in codes:
# Create a list of codes based on combinations of codes that are next
# to each other. The amount of savings for the new code is calculated.
new_codes = []
for c in list(set(re.findall('(?msu)..', txt))):
i = ord(c[0:1])
j = ord(c[1:2])
if codes[i]+codes[j] in codes:
continue
savings = txt.count(chr(i)+chr(j)) - len(codes[i]) - len(codes[j])
if savings > 2 and len(codes[i]) + len(codes[j]) < 256:
new_codes.append((savings, i, j, codes[i], codes[j]))
if new_codes:
new = True
# Sort the codes from highest savings to lowest.
new_codes.sort(lambda x, y: -1 if x[0] > y[0] else 1 if x[0] < y[0] else 0)
# The shorter new_length the more chances time merging will happen
# giving more changes for better codes to be created. However,
# the shorter new_lengh the longer it will take to compress.
new_codes = new_codes[:new_length]
for code in new_codes:
if '' not in codes:
break
c = codes.index('')
codes[c] = code[3]+code[4]
txt = txt.replace(chr(code[1])+chr(code[2]), chr(c))
# Generate the code dictionary.
header = []
for code in codes:
header.append(chr(len(code))+code)
for i in xrange(len(header), 256):
header.append(chr(0))
# Join the identifier with the dictionary and coded text.
return '!!8-Bit!!'+''.join(header)+txt
def compress(txt):
t = TCRCompressor()
return t.compress(txt)
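A hedged round-trip check of the rewritten module (Python 2 byte strings; decompress() reads from a stream, so a StringIO wrapper is assumed to suffice):

from cStringIO import StringIO

data = 'the cat sat on the mat ' * 40
packed = compress(data)            # '!!8-Bit!!' + code dictionary + coded text
assert packed.startswith('!!8-Bit!!')
assert decompress(StringIO(packed)) == data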

@@ -88,6 +88,7 @@ class Plumber(object):
self.ui_reporter = report_progress
self.abort_after_input_dump = abort_after_input_dump

# Pipeline options {{{
# Initialize the conversion options that are independent of input and
# output formats. The input and output plugins can still disable these
# options via recommendations.
@@ -527,6 +528,7 @@ OptionRecommendation(name='timestamp',
help=_('Set the book timestamp (used by the date column in calibre).')),

]
# }}}

input_fmt = os.path.splitext(self.input)[1]
if not input_fmt:
@@ -977,6 +979,8 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
opts.preprocess_html, opts)
if not encoding:
encoding = None
oeb = OEBBook(log, html_preprocessor,
pretty_print=opts.pretty_print, input_encoding=encoding)
if not populate:

@@ -51,16 +51,16 @@ def chap_head(match):
chap = match.group('chap')
title = match.group('title')
if not title:
return '<h1>'+chap+'</h1><br/>\n'
return '<h1>'+chap+'</h1><br/>\n'
else:
return '<h1>'+chap+'</h1>\n<h3>'+title+'</h3>\n'
return '<h1>'+chap+'</h1>\n<h3>'+title+'</h3>\n'

def wrap_lines(match):
ital = match.group('ital')
if not ital:
return ' '
return ' '
else:
return ital+' '
return ital+' '

class DocAnalysis(object):
'''
@@ -78,6 +78,8 @@ class DocAnalysis(object):
linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
elif format == 'spanned_html':
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
elif format == 'txt':
linere = re.compile('.*?\n')
self.lines = linere.findall(raw)

def line_length(self, percent):
@@ -175,7 +177,7 @@ class Dehyphenator(object):
def __init__(self):
# Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex'
self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation
self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
@@ -191,13 +193,13 @@ class Dehyphenator(object):
dehyphenated = unicode(firsthalf) + unicode(secondhalf)
lookupword = self.removesuffixes.sub('', dehyphenated)
if self.prefixes.match(firsthalf) is None:
lookupword = self.removeprefix.sub('', lookupword)
lookupword = self.removeprefix.sub('', lookupword)
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
try:
searchresult = self.html.find(lookupword.lower())
except:
return hyphenated
if self.format == 'html_cleanup':
if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
if self.html.find(lookupword) != -1 or searchresult != -1:
#print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
return dehyphenated
@@ -223,10 +225,15 @@ class Dehyphenator(object):
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)
elif format == 'pdf':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
elif format == 'txt':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
elif format == 'individual_words':
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
elif format == 'html_cleanup':
intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
elif format == 'txt_cleanup':
intextmatch = re.compile(u'(?P<firstpart>\w+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')

html = intextmatch.sub(self.dehyphenate, html)
return html
@@ -353,7 +360,7 @@ class HTMLPreProcessor(object):
(re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),

# Center separator lines
(re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
(re.compile(u'<br>\s*(?P<break>([*#•✦]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),

# Remove page links
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
@@ -363,13 +370,11 @@ class HTMLPreProcessor(object):
# Remove gray background
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),

# Detect Chapters to match default XPATH in GUI
(re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
# Cover the case where every letter in a chapter title is separated by a space
(re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
# Convert line breaks to paragraphs
(re.compile(r'<br[^>]*>\s*'), lambda match : '</p>\n<p>'),
(re.compile(r'<body[^>]*>\s*'), lambda match : '<body>\n<p>'),
(re.compile(r'\s*</body>'), lambda match : '</p>\n</body>'),

# Have paragraphs show better
(re.compile(r'<br.*?>'), lambda match : '<p>'),
# Clean up spaces
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
# Add space before and after italics
@@ -455,9 +460,9 @@ class HTMLPreProcessor(object):
# delete soft hyphens - moved here so it's executed after header/footer removal
if is_pdftohtml:
# unwrap/delete soft hyphens
end_rules.append((re.compile(u'[](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
end_rules.append((re.compile(u'[](</p>\s*<p>\s*)+\s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens with formatting
end_rules.append((re.compile(u'[]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
end_rules.append((re.compile(u'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))

# Make the more aggressive chapter marking regex optional with the preprocess option to
# reduce false positives and move after header/footer removal
@@ -475,7 +480,7 @@ class HTMLPreProcessor(object):
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
end_rules.append(
# Unwrap using punctuation
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)

for rule in self.PREPROCESS + start_rules:
@@ -508,7 +513,15 @@ class HTMLPreProcessor(object):
if is_pdftohtml and length > -1:
# Dehyphenate
dehyphenator = Dehyphenator()
html = dehyphenator(html,'pdf', length)
html = dehyphenator(html,'html', length)

if is_pdftohtml:
from calibre.ebooks.conversion.utils import PreProcessor
pdf_markup = PreProcessor(self.extra_opts, None)
totalwords = 0
totalwords = pdf_markup.get_word_count(html)
if totalwords > 7000:
html = pdf_markup.markup_chapters(html, totalwords, True)

#dump(html, 'post-preprocess')

@@ -554,5 +567,9 @@ class HTMLPreProcessor(object):
html = smartyPants(html)
html = html.replace(start, '<!--')
html = html.replace(stop, '-->')
# convert ellipsis to entities to prevent wrapping
html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html)
# convert double dashes to em-dash
html = re.sub(r'\s--\s', u'\u2014', html)
return substitute_entites(html)
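On a toy string the two new typographic substitutions behave like this:

import re

s = u'Wait... or wait -- forever'
s = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', u'…', s)
s = re.sub(r'\s--\s', u'\u2014', s)
print(s)   # u'Wait… or wait—forever' (spaces around the dash are consumed)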

@@ -6,8 +6,10 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import re
from math import ceil
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log
from calibre.utils.wordcount import get_wordcount_obj

class PreProcessor(object):

@@ -17,6 +19,9 @@ class PreProcessor(object):
self.found_indents = 0
self.extra_opts = extra_opts

def is_pdftohtml(self, src):
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]

def chapter_head(self, match):
chap = match.group('chap')
title = match.group('title')
@@ -64,7 +69,7 @@ class PreProcessor(object):
inspect. Percent is the minimum percent of line endings which should
be marked up to return true.
'''
htm_end_ere = re.compile('</p>', re.DOTALL)
htm_end_ere = re.compile('</(p|div)>', re.DOTALL)
line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
htm_end = htm_end_ere.findall(raw)
line_end = line_end_ere.findall(raw)
@@ -101,36 +106,140 @@ class PreProcessor(object):
with open(os.path.join(odir, name), 'wb') as f:
f.write(raw.encode('utf-8'))

def get_word_count(self, html):
word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
wordcount = get_wordcount_obj(word_count_text)
return wordcount.words
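The word count deliberately ignores <head> metadata and markup, so only rendered text is counted. Roughly:

html = '<head><title>x</title></head><body><p>one two <b>three</b></p></body>'
text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)   # drop the head block
text = re.sub(r'<[^>]*>', '', text)                     # strip remaining tags
# text is now 'one two three' -> 3 words fed to get_wordcount_obj()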

def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
# Typical chapters are between 2000 and 7000 words, use the larger number to decide the
# minimum number of chapters to search for
self.min_chapters = 1
if wordcount > 7000:
self.min_chapters = int(ceil(wordcount / 7000.))
#print "minimum chapters required are: "+str(self.min_chapters)
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html))
self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")

# Build the Regular Expressions in pieces
init_lookahead = "(?=<(p|div))"
chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
chapter_header_open = r"(?P<chap>"
title_header_open = r"(?P<title>"
chapter_header_close = ")\s*"
title_header_close = ")"
chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"

is_pdftohtml = self.is_pdftohtml(html)
if is_pdftohtml:
chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
title_line_open = "<(?P<outer2>p)[^>]*>\s*"
title_line_close = "\s*</(?P=outer2)>"

if blanks_between_paragraphs:
blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
else:
blank_lines = ""
opt_title_open = "("
opt_title_close = ")?"
n_lookahead_open = "\s+(?!"
n_lookahead_close = ")"

default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"

chapter_types = [
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
[r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
[r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
]

# Start with most typical chapter headings, get more aggressive until one works
for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
if self.html_preprocess_sections >= self.min_chapters:
break
full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
if lookahead_ignorecase:
chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
else:
chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
html = chapdetect.sub(self.chapter_head, html)

words_per_chptr = wordcount
if words_per_chptr > 0 and self.html_preprocess_sections > 0:
words_per_chptr = wordcount / self.html_preprocess_sections
self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
return html

def punctuation_unwrap(self, length, content, format):
# define the pieces of the regex
lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"

unwrap_regex = lookahead+line_ending+blanklines+line_opening
if format == 'txt':
unwrap_regex = lookahead+txt_line_wrap

unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
content = unwrap.sub(' ', content)
return content
||||
|
||||
|
||||
def __call__(self, html):
|
||||
self.log("********* Preprocessing HTML *********")
|
||||
|
||||
# Count the words in the document to estimate how many chapters to look for and whether
|
||||
# other types of processing are attempted
|
||||
totalwords = 0
|
||||
totalwords = self.get_word_count(html)
|
||||
|
||||
if totalwords < 50:
|
||||
self.log("not enough text, not preprocessing")
|
||||
return html
|
||||
|
||||
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
|
||||
html = re.sub(r"\s*</p>", "</p>\n", html)
|
||||
html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html)
|
||||
html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
|
||||
html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
|
||||
|
||||
###### Check Markup ######
|
||||
#
|
||||
# some lit files don't have any <p> tags or equivalent (generally just plain text between
|
||||
# <pre> tags), check and mark up line endings if required before proceeding
|
||||
if self.no_markup(html, 0.1):
|
||||
self.log("not enough paragraph markers, adding now")
|
||||
# check if content is in pre tags, use txt processor to mark up if so
|
||||
pre = re.compile(r'<pre>', re.IGNORECASE)
|
||||
if len(pre.findall(html)) == 1:
|
||||
self.log("Running Text Processing")
|
||||
from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
|
||||
separate_paragraphs_single_line
|
||||
outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
|
||||
html = outerhtml.sub('\g<text>', html)
|
||||
html = separate_paragraphs_single_line(html)
|
||||
html = preserve_spaces(html)
|
||||
html = convert_basic(html, epub_split_size_kb=0)
|
||||
else:
|
||||
# Add markup naively
|
||||
# TODO - find out if there are cases where there are more than one <pre> tag or
|
||||
# other types of unmarked html and handle them in some better fashion
|
||||
add_markup = re.compile('(?<!>)(\n)')
|
||||
html = add_markup.sub('</p>\n<p>', html)
|
||||
self.log("not enough paragraph markers, adding now")
|
||||
# check if content is in pre tags, use txt processor to mark up if so
|
||||
pre = re.compile(r'<pre>', re.IGNORECASE)
|
||||
if len(pre.findall(html)) == 1:
|
||||
self.log("Running Text Processing")
|
||||
from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
|
||||
separate_paragraphs_single_line
|
||||
outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
|
||||
html = outerhtml.sub('\g<text>', html)
|
||||
html = separate_paragraphs_single_line(html)
|
||||
html = preserve_spaces(html)
|
||||
html = convert_basic(html, epub_split_size_kb=0)
|
||||
else:
|
||||
# Add markup naively
|
||||
# TODO - find out if there are cases where there are more than one <pre> tag or
|
||||
# other types of unmarked html and handle them in some better fashion
|
||||
add_markup = re.compile('(?<!>)(\n)')
|
||||
html = add_markup.sub('</p>\n<p>', html)
|
||||
|
||||
###### Mark Indents/Cleanup ######
|
||||
#
|
||||
@ -141,12 +250,17 @@ class PreProcessor(object):
|
||||
self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
|
||||
# remove remaining non-breaking spaces
|
||||
html = re.sub(ur'\u00a0', ' ', html)
|
||||
# Get rid of various common microsoft specific tags which can cause issues later
|
||||
# Get rid of empty <o:p> tags to simplify other processing
|
||||
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
|
||||
# Delete microsoft 'smart' tags
|
||||
html = re.sub('(?i)</?st1:\w+>', '', html)
|
||||
# Get rid of empty span, bold, & italics tags
|
||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||
html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
|
||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||
# ADE doesn't render <br />, change to empty paragraphs
|
||||
#html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
|
||||
|
||||
# If more than 40% of the lines are empty paragraphs and the user has enabled remove
|
||||
# paragraph spacing then delete blank lines to clean up spacing
|
||||
@ -164,63 +278,16 @@ class PreProcessor(object):
|
||||
self.log("deleting blank lines")
|
||||
html = blankreg.sub('', html)
|
||||
elif float(len(blanklines)) / float(len(lines)) > 0.40:
|
||||
blanks_between_paragraphs = True
|
||||
#print "blanks between paragraphs is marked True"
|
||||
blanks_between_paragraphs = True
|
||||
#print "blanks between paragraphs is marked True"
|
||||
else:
|
||||
blanks_between_paragraphs = False
|
||||
|
||||
#self.dump(html, 'before_chapter_markup')
|
||||
# detect chapters/sections to match xpath or splitting logic
|
||||
#
|
||||
# Build the Regular Expressions in pieces
|
||||
init_lookahead = "(?=<(p|div))"
|
||||
chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
|
||||
title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
|
||||
chapter_header_open = r"(?P<chap>"
|
||||
title_header_open = r"(?P<title>"
|
||||
chapter_header_close = ")\s*"
|
||||
title_header_close = ")"
|
||||
chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
|
||||
title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
|
||||
|
||||
if blanks_between_paragraphs:
|
||||
blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
|
||||
else:
|
||||
blank_lines = ""
|
||||
opt_title_open = "("
|
||||
opt_title_close = ")?"
|
||||
n_lookahead_open = "\s+(?!"
|
||||
n_lookahead_close = ")"
|
||||
|
||||
default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"
|
||||
|
||||
min_chapters = 10
|
||||
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
|
||||
self.html_preprocess_sections = len(heading.findall(html))
|
||||
self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
|
||||
|
||||
chapter_types = [
|
||||
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
|
||||
[r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
|
||||
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
|
||||
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
|
||||
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
|
||||
]
|
||||
|
||||
# Start with most typical chapter headings, get more aggressive until one works
|
||||
for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
|
||||
if self.html_preprocess_sections >= min_chapters:
|
||||
break
|
||||
full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
|
||||
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
|
||||
self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
|
||||
if lookahead_ignorecase:
|
||||
chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
|
||||
chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
|
||||
else:
|
||||
chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
|
||||
chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
|
||||
|
||||
html = chapdetect.sub(self.chapter_head, html)
|
||||
html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
|
||||
|
||||
|
||||
###### Unwrap lines ######
|
||||
@@ -247,7 +314,7 @@ class PreProcessor(object):
        # Calculate Length
        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
        length = docanalysis.line_length(unwrap_factor)
        self.log("*** Median line length is " + unicode(length) + ", calculated with " + format + " format ***")
        self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
        # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
        if hardbreaks or unwrap_factor < 0.4:
            self.log("Unwrapping required, unwrapping Lines")
@@ -260,8 +327,7 @@ class PreProcessor(object):
            self.log("Done dehyphenating")
            # Unwrap lines using punctuation and line length
            #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
            unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
            html = unwrap.sub(' ', html)
            html = self.punctuation_unwrap(length, html, 'html')
            # check any remaining hyphens, but only unwrap if there is a match
            dehyphenator = Dehyphenator()
            html = dehyphenator(html,'html_cleanup', length)
@@ -276,7 +342,7 @@ class PreProcessor(object):
        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)

        # If still no sections after unwrapping mark split points on lines with no punctuation
        if self.html_preprocess_sections < 5:
        if self.html_preprocess_sections < self.min_chapters:
            self.log("Looking for more split points based on punctuation,"
                    " currently have " + unicode(self.html_preprocess_sections))
            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
@@ -291,6 +357,6 @@ class PreProcessor(object):
            html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)

        # Center separator lines
        html = re.sub(u'<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
        html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)

        return html
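
# --- Editor's sketch (not the punctuation_unwrap implementation): the unwrap
# idea above in miniature. If a paragraph break follows roughly a median-length
# line that ends without closing punctuation, treat it as a hard wrap and merge.
import re

def unwrap(html, length):
    # length plays the role of the median line length from docanalysis.line_length()
    pat = re.compile(u"(?<=.{%i}[a-z,;:)])\s*</p>\s*<p[^>]*>\s*" % length, re.UNICODE)
    return pat.sub(u' ', html)

print(unwrap(u'<p>the line was broken right</p><p>here by the converter.</p>', 20))
# -> <p>the line was broken right here by the converter.</p>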
@@ -40,7 +40,7 @@ class FB2MLizer(object):
        # in different directories. FB2 images are all in a flat layout so we rename all images
        # into a sequential numbering system to ensure there are no collisions between image names.
        self.image_hrefs = {}
        # Mapping of toc items and their
        self.toc = {}
        # Used to see whether a new <section> needs to be opened
        self.section_level = 0
@@ -50,7 +50,7 @@ class FB2MLizer(object):
        self.oeb_book = oeb_book
        self.opts = opts
        self.reset_state()

        # Used for adding <section>s and <title>s to allow readers
        # to generate toc from the document.
        if self.opts.sectionize == 'toc':
@@ -74,20 +74,20 @@ class FB2MLizer(object):
        text = re.sub(r'(?miu)<p>\s*</p>', '', text)
        text = re.sub(r'(?miu)\s*</p>', '</p>', text)
        text = re.sub(r'(?miu)</p>\s*<p>', '</p>\n\n<p>', text)

        text = re.sub(r'(?miu)<title>\s*</title>', '', text)
        text = re.sub(r'(?miu)\s+</title>', '</title>', text)

        text = re.sub(r'(?miu)<section>\s*</section>', '', text)
        text = re.sub(r'(?miu)\s*</section>', '\n</section>', text)
        text = re.sub(r'(?miu)</section>\s*', '</section>\n\n', text)
        text = re.sub(r'(?miu)\s*<section>', '\n<section>', text)
        text = re.sub(r'(?miu)<section>\s*', '<section>\n', text)
        text = re.sub(r'(?miu)</section><section>', '</section>\n\n<section>', text)

        if self.opts.insert_blank_line:
            text = re.sub(r'(?miu)</p>', '</p><empty-line />', text)

        return text

    def fb2_header(self):
@@ -101,6 +101,7 @@ class FB2MLizer(object):
        metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year)
        metadata['lang'] = u''.join(self.oeb_book.metadata.lang) if self.oeb_book.metadata.lang else 'en'
        metadata['id'] = None
        metadata['cover'] = self.get_cover()

        author_parts = self.oeb_book.metadata.creator[0].value.split(' ')
        if len(author_parts) == 1:
@@ -120,10 +121,11 @@ class FB2MLizer(object):
                break
        if metadata['id'] is None:
            self.log.warn('No UUID identifier found')
            metadata['id'] = str(uuid.uuid4())

        for key, value in metadata.items():
            metadata[key] = prepare_string_for_xml(value)
            if not key == 'cover':
                metadata[key] = prepare_string_for_xml(value)

        return u'<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:xlink="http://www.w3.org/1999/xlink">' \
            '<description>' \
@@ -135,6 +137,7 @@ class FB2MLizer(object):
            '<last-name>%(author_last)s</last-name>' \
            '</author>' \
            '<book-title>%(title)s</book-title>' \
            '%(cover)s' \
            '<lang>%(lang)s</lang>' \
            '</title-info>' \
            '<document-info>' \
@@ -153,31 +156,64 @@ class FB2MLizer(object):
    def fb2_footer(self):
        return u'</FictionBook>'

    def get_cover(self):
        cover_href = None

        # Get the raster cover if it's available.
        if self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
            id = unicode(self.oeb_book.metadata.cover[0])
            cover_item = self.oeb_book.manifest.ids[id]
            if cover_item.media_type in OEB_RASTER_IMAGES:
                cover_href = cover_item.href
        else:
            # Figure out if we have a title page or a cover page
            page_name = ''
            if 'titlepage' in self.oeb_book.guide:
                page_name = 'titlepage'
            elif 'cover' in self.oeb_book.guide:
                page_name = 'cover'

            if page_name:
                cover_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[page_name].href]
                # Get the first image in the page
                for img in cover_item.xpath('//img'):
                    cover_href = cover_item.abshref(img.get('src'))
                    break

        if cover_href:
            # Only write the image tag if it is in the manifest.
            if cover_href in self.oeb_book.manifest.hrefs.keys():
                if cover_href not in self.image_hrefs.keys():
                    self.image_hrefs[cover_href] = '_%s.jpg' % len(self.image_hrefs.keys())
                return u'<coverpage><image xlink:href="#%s" /></coverpage>' % self.image_hrefs[cover_href]

        return u''

    def get_text(self):
        text = ['<body>']

        # Create main section if there are no others to create
        if self.opts.sectionize == 'nothing':
            text.append('<section>')
            self.section_level += 1

        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to FictionBook2 XML' % item.href)
            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)

            # Start a <section> if we must sectionize each file or if the TOC references this page
            page_section_open = False
            if self.opts.sectionize == 'files' or self.toc.get(item.href) == 'page':
                text.append('<section>')
                page_section_open = True
                self.section_level += 1

            text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)

            if page_section_open:
                text.append('</section>')
                self.section_level -= 1

        # Close any open sections
        while self.section_level > 0:
            text.append('</section>')
@@ -200,8 +236,10 @@ class FB2MLizer(object):
                im = Image()
                im.load(item.data)
                im.set_compression_quality(70)
                data = im.export('jpg')
                raw_data = b64encode(data)
                imdata = im.export('jpg')
                raw_data = b64encode(imdata)
            else:
                raw_data = b64encode(item.data)
            # Don't put the encoded image on a single line.
            data = ''
            col = 1
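
# --- Editor's sketch (not calibre code) of the "don't put the encoded image on
# a single line" step that follows: wrap the base64 payload at a fixed column
# so the generated XML stays readable. The helper name and width are invented.
from base64 import b64encode

def wrap_b64(raw_bytes, width=72):
    encoded = b64encode(raw_bytes)
    return '\n'.join(encoded[i:i + width] for i in range(0, len(encoded), width))

print(wrap_b64('x' * 100))  # -> two lines (72 + 64 characters)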
@@ -314,7 +352,7 @@ class FB2MLizer(object):
                    self.toc[page.href] = None
                elif toc_entry and elem_tree.attrib.get('id', None):
                    newlevel = toc_entry.get(elem_tree.attrib.get('id', None), None)

                    # Start a new section if necessary
                    if newlevel:
                        if not (newlevel > self.section_level):
@@ -41,17 +41,24 @@ class FB2Input(InputFormatPlugin):
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER
        from calibre.ebooks.chardet import xml_to_unicode
        NAMESPACES = {'f':FB2NS, 'l':XLINK_NS}
        log.debug('Parsing XML...')
        raw = stream.read().replace('\0', '')
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
            assume_utf8=True, resolve_entities=True)[0]
        try:
            doc = etree.fromstring(raw)
        except etree.XMLSyntaxError:
            try:
                doc = etree.fromstring(raw, parser=RECOVER_PARSER)
                if doc is None:
                    raise Exception('parse failed')
            except:
                doc = etree.fromstring(raw.replace('& ', '&amp; '),
                        parser=RECOVER_PARSER)
        if doc is None:
            raise ValueError('The FB2 file is not valid XML')
        stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
        css = ''
        for s in stylesheets:
@@ -97,13 +104,17 @@ class FB2Input(InputFormatPlugin):
        entries = [(f, guess_type(f)[0]) for f in os.listdir('.')]
        opf.create_manifest(entries)
        opf.create_spine(['index.xhtml'])

        for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
            href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
            if href is not None:
                if href.startswith('#'):
                    href = href[1:]
                opf.guide.set_cover(os.path.abspath(href))
        if mi.cover_data and mi.cover_data[1]:
            with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
                f.write(mi.cover_data[1])
            opf.guide.set_cover(os.path.abspath('fb2_cover_calibre_mi.jpg'))
        else:
            for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
                href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
                if href is not None:
                    if href.startswith('#'):
                        href = href[1:]
                    opf.guide.set_cover(os.path.abspath(href))

        opf.render(open('metadata.opf', 'wb'))
        return os.path.join(os.getcwd(), 'metadata.opf')
@@ -35,7 +35,7 @@ class FB2Output(OutputFormatPlugin):
            rasterizer = SVGRasterizer()
            rasterizer(oeb_book, opts)
        except Unavailable:
            self.log.warn('SVG rasterizer unavailable, SVG will not be converted')
            log.warn('SVG rasterizer unavailable, SVG will not be converted')

        linearize_jacket(oeb_book)
@@ -119,7 +119,7 @@ class HTMLFile(object):

        self.is_binary = level > 0 and not bool(self.HTML_PAT.search(src[:4096]))
        if not self.is_binary:
            if encoding is None:
            if not encoding:
                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
            self.encoding = encoding
        else:
@@ -16,6 +16,7 @@ from calibre.ebooks.metadata.book import TOP_LEVEL_CLASSIFIERS
from calibre.ebooks.metadata.book import ALL_METADATA_FIELDS
from calibre.library.field_metadata import FieldMetadata
from calibre.utils.date import isoformat, format_date
from calibre.utils.icu import sort_key
from calibre.utils.formatter import TemplateFormatter

@@ -38,15 +39,16 @@ class SafeFormat(TemplateFormatter):

    def get_value(self, key, args, kwargs):
        try:
            key = key.lower()
            if key != 'title_sort':
                key = field_metadata.search_term_to_field_key(key.lower())
                key = field_metadata.search_term_to_field_key(key)
            b = self.book.get_user_metadata(key, False)
            if b and b['datatype'] == 'int' and self.book.get(key, 0) == 0:
                v = ''
            elif b and b['datatype'] == 'float' and self.book.get(key, 0.0) == 0.0:
                v = ''
            else:
                ign, v = self.book.format_field(key.lower(), series_with_index=False)
                ign, v = self.book.format_field(key, series_with_index=False)
            if v is None:
                return ''
            if v == '':
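
# --- Editor's sketch (not calibre code) of the rule get_value encodes above:
# zero-valued int/float custom columns render as the empty string instead of
# '0' or '0.0'. The render() helper and its inputs are invented for illustration.
def render(datatype, value):
    if datatype == 'int' and value == 0:
        return ''
    if datatype == 'float' and value == 0.0:
        return ''
    return unicode(value)

print(repr(render('int', 0)))      # ''
print(repr(render('float', 2.5)))  # u'2.5'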
@@ -159,6 +161,11 @@ class Metadata(object):
        try:
            return self.__getattribute__(field)
        except AttributeError:
            if field.startswith('#') and field.endswith('_index'):
                try:
                    return self.get_extra(field[:-6])
                except:
                    pass
            return default

    def get_extra(self, field):
@@ -317,14 +324,16 @@ class Metadata(object):
        if metadata is None:
            traceback.print_stack()
            return
        metadata = copy.deepcopy(metadata)
        if '#value#' not in metadata:
            if metadata['datatype'] == 'text' and metadata['is_multiple']:
                metadata['#value#'] = []
        m = {}
        for k in metadata:
            m[k] = copy.copy(metadata[k])
        if '#value#' not in m:
            if m['datatype'] == 'text' and m['is_multiple']:
                m['#value#'] = []
            else:
                metadata['#value#'] = None
                m['#value#'] = None
        _data = object.__getattribute__(self, '_data')
        _data['user_metadata'][field] = metadata
        _data['user_metadata'][field] = m

    def template_to_attribute(self, other, ops):
        '''
@@ -484,7 +493,7 @@ class Metadata(object):
        return authors_to_string(self.authors)

    def format_tags(self):
        return u', '.join([unicode(t) for t in self.tags])
        return u', '.join([unicode(t) for t in sorted(self.tags, key=sort_key)])

    def format_rating(self):
        return unicode(self.rating)
@@ -524,7 +533,7 @@ class Metadata(object):
            orig_res = res
            datatype = cmeta['datatype']
            if datatype == 'text' and cmeta['is_multiple']:
                res = u', '.join(res)
                res = u', '.join(sorted(res, key=sort_key))
            elif datatype == 'series' and series_with_index:
                if self.get_extra(key) is not None:
                    res = res + \
@@ -554,7 +563,7 @@ class Metadata(object):
            elif key == 'series_index':
                res = self.format_series_index(res)
            elif datatype == 'text' and fmeta['is_multiple']:
                res = u', '.join(res)
                res = u', '.join(sorted(res, key=sort_key))
            elif datatype == 'series' and series_with_index:
                res = res + ' [%s]'%self.format_series_index()
            elif datatype == 'datetime':
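
# --- Editor's sketch: the effect of the sort_key changes above. Multi-valued
# fields such as tags are now joined in ICU-collated order rather than
# insertion order; the tag values here are invented.
from calibre.utils.icu import sort_key

tags = [u'Zebra', u'apple', u'Science Fiction']
print(u', '.join(sorted(tags, key=sort_key)))  # e.g. apple, Science Fiction, Zebra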
@@ -9,6 +9,7 @@ import mimetypes, os
from base64 import b64decode
from lxml import etree
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.chardet import xml_to_unicode

XLINK_NS = 'http://www.w3.org/1999/xlink'
def XLINK(name):
@@ -23,7 +24,10 @@ def get_metadata(stream):
    tostring = lambda x : etree.tostring(x, method='text',
            encoding=unicode).strip()
    parser = etree.XMLParser(recover=True, no_network=True)
    root = etree.fromstring(stream.read(), parser=parser)
    raw = stream.read()
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
            assume_utf8=True)[0]
    root = etree.fromstring(raw, parser=parser)
    authors, author_sort = [], None
    for au in XPath('//fb2:author')(root):
        fname = lname = author = None
@@ -17,10 +17,10 @@ BASE_URL = 'http://isbndb.com/api/books.xml?access_key=%(key)s&page_number=1&res
class ISBNDBError(Exception):
    pass

def fetch_metadata(url, max=100, timeout=5.):
def fetch_metadata(url, max=3, timeout=5.):
    books = []
    page_number = 1
    total_results = sys.maxint
    total_results = 31
    br = browser()
    while len(books) < total_results and max > 0:
        try:
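
# --- Editor's sketch (not the isbndb plugin) of the capped pagination the
# change above introduces: stop after a fixed number of pages even when the
# server reports more results. fetch_page() is an invented stand-in that
# returns (batch_of_books, total_results).
def fetch_all(fetch_page, max_pages=3, total_results=31):
    books, page_number = [], 1
    while len(books) < total_results and max_pages > 0:
        batch, total_results = fetch_page(page_number)
        if not batch:
            break
        books.extend(batch)
        page_number += 1
        max_pages -= 1
    return books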
@@ -10,7 +10,8 @@ from calibre.ebooks.metadata import MetaInformation, string_to_authors
title_pat = re.compile(r'\{\\info.*?\{\\title(.*?)(?<!\\)\}', re.DOTALL)
author_pat = re.compile(r'\{\\info.*?\{\\author(.*?)(?<!\\)\}', re.DOTALL)
comment_pat = re.compile(r'\{\\info.*?\{\\subject(.*?)(?<!\\)\}', re.DOTALL)
category_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)
tags_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)
publisher_pat = re.compile(r'\{\\info.*?\{\\manager(.*?)(?<!\\)\}', re.DOTALL)

def get_document_info(stream):
    """
@@ -82,61 +83,73 @@ def decode(raw, codec):

def get_metadata(stream):
    """ Return metadata as a L{MetaInfo} object """
    title, author, comment, category = None, None, None, None
    stream.seek(0)
    if stream.read(5) != r'{\rtf':
        return MetaInformation(None, None)
        return MetaInformation(_('Unknown'))
    block = get_document_info(stream)[0]
    if not block:
        return MetaInformation(None, None)
        return MetaInformation(_('Unknown'))

    stream.seek(0)
    cpg = detect_codepage(stream)
    stream.seek(0)

    title_match = title_pat.search(block)
    if title_match:
    if title_match is not None:
        title = decode(title_match.group(1).strip(), cpg)
    else:
        title = _('Unknown')
    author_match = author_pat.search(block)
    if author_match:
    if author_match is not None:
        author = decode(author_match.group(1).strip(), cpg)
    comment_match = comment_pat.search(block)
    if comment_match:
        comment = decode(comment_match.group(1).strip(), cpg)
    category_match = category_pat.search(block)
    if category_match:
        category = decode(category_match.group(1).strip(), cpg)
    mi = MetaInformation(title, author)
    else:
        author = None
    mi = MetaInformation(title)
    if author:
        mi.authors = string_to_authors(author)
    mi.comments = comment
    mi.category = category

    comment_match = comment_pat.search(block)
    if comment_match is not None:
        comment = decode(comment_match.group(1).strip(), cpg)
        mi.comments = comment
    tags_match = tags_pat.search(block)
    if tags_match is not None:
        tags = decode(tags_match.group(1).strip(), cpg)
        mi.tags = tags
    publisher_match = publisher_pat.search(block)
    if publisher_match is not None:
        publisher = decode(publisher_match.group(1).strip(), cpg)
        mi.publisher = publisher

    return mi
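
# --- Editor's usage sketch: reading RTF metadata from a file-like object with
# the get_metadata() above. The RTF sample is invented and simplified; real
# files carry much more structure around the \info group.
from cStringIO import StringIO

rtf = r'{\rtf1{\info{\title A Sample Title}{\author A. N. Author}}}'
mi = get_metadata(StringIO(rtf))
print(mi.title)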

def create_metadata(stream, options):
    md = r'{\info'
    md = [r'{\info']
    if options.title:
        title = options.title.encode('ascii', 'ignore')
        md += r'{\title %s}'%(title,)
        md.append(r'{\title %s}'%(title,))
    if options.authors:
        au = options.authors
        if not isinstance(au, basestring):
            au = u', '.join(au)
        author = au.encode('ascii', 'ignore')
        md += r'{\author %s}'%(author,)
    if options.get('category', None):
        category = options.category.encode('ascii', 'ignore')
        md += r'{\category %s}'%(category,)
        md.append(r'{\author %s}'%(author,))
    comp = options.comment if hasattr(options, 'comment') else options.comments
    if comp:
        comment = comp.encode('ascii', 'ignore')
        md += r'{\subject %s}'%(comment,)
    if len(md) > 6:
        md += '}'
        md.append(r'{\subject %s}'%(comment,))
    if options.publisher:
        publisher = options.publisher.encode('ascii', 'ignore')
        md.append(r'{\manager %s}'%(publisher,))
    if options.tags:
        tags = u', '.join(options.tags)
        tags = tags.encode('ascii', 'ignore')
        md.append(r'{\category %s}'%(tags,))
    if len(md) > 1:
        md.append('}')
    stream.seek(0)
    src = stream.read()
    ans = src[:6] + md + src[6:]
    ans = src[:6] + u''.join(md) + src[6:]
    stream.seek(0)
    stream.write(ans)
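
# --- Editor's sketch: the pattern the rewrite above adopts. Fragments are
# collected in a list and joined once, instead of repeated string concatenation;
# the closing brace is only added when at least one info field was appended.
md = [r'{\info']
md.append(r'{\title Example}')
if len(md) > 1:
    md.append('}')
print(u''.join(md))  # {\info{\title Example}}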
@@ -156,7 +169,7 @@ def set_metadata(stream, options):

    base_pat = r'\{\\name(.*?)(?<!\\)\}'
    title = options.title
    if title != None:
    if title is not None:
        title = title.encode('ascii', 'replace')
        pat = re.compile(base_pat.replace('name', 'title'), re.DOTALL)
        if pat.search(src):
@@ -164,7 +177,7 @@ def set_metadata(stream, options):
        else:
            src = add_metadata_item(src, 'title', title)
    comment = options.comments
    if comment != None:
    if comment is not None:
        comment = comment.encode('ascii', 'replace')
        pat = re.compile(base_pat.replace('name', 'subject'), re.DOTALL)
        if pat.search(src):
@@ -172,7 +185,7 @@ def set_metadata(stream, options):
        else:
            src = add_metadata_item(src, 'subject', comment)
    author = options.authors
    if author != None:
    if author is not None:
        author = ', '.join(author)
        author = author.encode('ascii', 'ignore')
        pat = re.compile(base_pat.replace('name', 'author'), re.DOTALL)
@@ -180,14 +193,23 @@ def set_metadata(stream, options):
            src = pat.sub(r'{\\author ' + author + r'}', src)
        else:
            src = add_metadata_item(src, 'author', author)
    category = options.get('category', None)
    if category != None:
        category = category.encode('ascii', 'replace')
    tags = options.tags
    if tags is not None:
        tags = ', '.join(tags)
        tags = tags.encode('ascii', 'replace')
        pat = re.compile(base_pat.replace('name', 'category'), re.DOTALL)
        if pat.search(src):
            src = pat.sub(r'{\\category ' + category + r'}', src)
            src = pat.sub(r'{\\category ' + tags + r'}', src)
        else:
            src = add_metadata_item(src, 'category', category)
            src = add_metadata_item(src, 'category', tags)
    publisher = options.publisher
    if publisher is not None:
        publisher = publisher.encode('ascii', 'replace')
        pat = re.compile(base_pat.replace('name', 'manager'), re.DOTALL)
        if pat.search(src):
            src = pat.sub(r'{\\manager ' + publisher + r'}', src)
        else:
            src = add_metadata_item(src, 'manager', publisher)
    stream.seek(pos + olen)
    after = stream.read()
    stream.seek(pos)
@@ -18,7 +18,6 @@ class xISBN(object):
        self._data = []
        self._map = {}

        self.br = browser()
        self.isbn_pat = re.compile(r'[^0-9X]', re.IGNORECASE)

    def purify(self, isbn):
@@ -26,7 +25,7 @@ class xISBN(object):

    def fetch_data(self, isbn):
        url = self.QUERY%isbn
        data = self.br.open_novisit(url).read()
        data = browser().open_novisit(url).read()
        data = json.loads(data)
        if data.get('stat', None) != 'ok':
            return []