Merge prior to trunk merge.

Starson17 2011-01-18 09:29:19 -05:00
commit 4c0865ea71
320 changed files with 131754 additions and 84361 deletions

View File

@@ -4,6 +4,342 @@
# for important features/bug fixes.
# Also, each release can have new and improved recipes.
- version: 0.7.40
date: 2011-01-14
new features:
- title: "A new 'highlight matches' search mode"
description: >
"There is now a checkbox next to the search bar named 'Highlight'. If you check it, searching will highlight
all matched books instead of filtering the book list to all matched books."
- title: "RTF Input: Improved support for conversion of images. The bug where some images were shrunk should no longer happen"
- title: "Template language: Allow you to create your own formatting functions. Accessible via Preferences->Advanced->Template functions"
- title: "News download: Convert various HTML 5 tags into <div> to support readers that cannot handle HTML 5 tags"
- title: "RTF metadata: Add support for publisher and tags."
tickets: [6657]
- title: "BibTeX catalog: Add support for custom columns"
- title: "TXT Input: Support for textile markup"
- title: "Various minor tweaks to improve usability of Preferences->Plugins"
- title: "TXT Output: Convert <hr> to scene break marker."
- title: "Support for the Archos 70"
- title: "SONY Driver: Add an option to automatically refresh the covers on every connect. Accessible via: Preferences->Plugins->Device interface plugins"
- title: "Add access to the larger template editor from plugboards via context menu."
- title: "Speed improvement when connecting a large library to a device"
- title: "Speedup when searching on multiple words in a large library"
- title: "TXT Input: Add a heauristic formatting processor"
bug fixes:
- title: "Fix bug that caused automatic news removal to remove any book that has a tag that contains the word 'news' instead of only books that have the tag News"
- title: "Refactor the downloading social metadata message box to allow canceling."
tickets: [8234]
- title: "Kobo drive does not deal with Null value in DateCreated column"
tickets: [8308]
- title: "MOBI Input: Fix regression that caused images placed inside svg tags to be discarded"
- title: "Fix selecting Tablet output profile would actually select the Samsung Galaxy S profile"
- title: "Catalog generation: Fix a condition that could cause TOCs to not be properly generated in MOBI format catalogs"
tickets: [8295]
- title: "Zip file reading: Be more tolerant when a zip file has a damaged file directory"
- title: "RTF Input: Various code cleanups. Go back to trying to handle unicode mappings without pre-processing. This will mean that some RTF files that used to convert, won't anymore. Please open tickets and attach them."
tickets: [8171]
- title: "ImageMagick: When identifying an image don't read the entire image"
- title: "FB2 Output: Add cover to FB2 metadata."
- title: "Fix inability to customize builting recipe when more than one recipe has the same name"
tickets: [8281]
- title: "RTF Input: Fix regression that broke the Preprocess HTML option"
- title: "Fix XSS vulnerability in content server."
tickets: [7980]
- title: "TXT Output: Clean up and produce consistant output. Spacing around headings. Headings are not indented when using the remove paragraph spacing option."
- title: "Catalog generation: Handle invalid covers gracefully"
- title: "Email settings: Before displaying the email test dialog warn the user that it will expose their email password"
- title: "PDB Output: Fix regression that caused some PDB files to not work with other software"
tickets: [8231]
improved recipes:
- Financial Times UK
- Globe and Mail
- Wired Daily
- MIT Technology Review
- MSNBC
- expansion.com
- New York Times
- Heraldo de Aragon
- Exiled online
new recipes:
- title: "Yakima Herald and Tri-City Herald"
author: "Laura Gjovaag"
- title: "Wichita Eagle"
author: "Jason Cameron"
- title: "Pressthink and Zero Hedge"
author: "Darko Miletic"
- title: "tyzden"
author: "zemiak"
- title: "El Correo"
author: "desUBIKado"
- title: "Cicero"
author: "mad"
- title: "El Publico"
author: "Gerardo Diez"
- version: 0.7.38
date: 2011-01-07
new features:
- title: "Reduce startup time when using a composite custom column"
- title: "Template language: Add a list_item function for use with tags like columns. See User Manual for details"
- title: "TXT Input: Attempt to detect the input encoding when not specified. Auto detect paragraph structure and formatting markup."
- title: "Search & replace: Add ability to manipulate number and boolean columns."
- title: "Add type ahead completion to the advanced search dialog."
tickets: [8035]
- title: "Double click on plugin in Preferences dialog to customize"
tickets: [8175]
- title: "Allow customization of the SONY driver to send thumbnail to the device. Useful with newer SONY readers"
tickets: [8161]
- title: "Smarten punctuation: Convert double dashes to em dashes. Preprocessing: Various tweaks"
bug fixes:
- title: "Fix regression causing the template formatter to intepret a missing format letter as ERROR instead of 's'."
- title: "Fix regression that broke conversion of PNG images in PDF files on OS X."
tickets: [8215]
- title: "Content server: Fix improper XML escaping of category titles in the OPDS feeds"
tickets: [8225]
- title: "When decoding XML if the XML starts with a UTF-8 BOM decode as UTF-8. Fixes parsing of FB2 files with UTF-8 BOMs"
- title: "E-book viewer: When scrolling to a bookmark and the content is wider than the window, do not scroll in the horizontal direction"
- title: "E-book viewer: Fix next page skipping the bottom of chapters when the content is wider than the window."
tickets: [8153]
- title: " FB2 Output: Insert covers."
tickets: [8172]
- title: "Content server: When serving OPDS feeds handle html descriptions that have namespaced attributes."
tickets: [7938]
- title: "When downloading metadata from isbndb.com, download a maximum of 30 results rather than 1000"
- title: "Fix sorting of tags column"
- title: "Change search/replace to show commas instead of vertical bars as the separator for multiple authors"
- title: "Template language: Make all column names case insensitive"
- title: "Fix bug that prevent the Disabled option for Tag Browser partiotining from working in the Preferences dialog"
- title: "Fix bug when using tags like custom column in the template language"
- title: "Fix bug where composite custom columns using general_program_mode fields are not evaluated correctly when used in a template."
- title: "ImageMagick interface: Don't crash when asked to open empty image files"
- title: "Kobo driver: Add TXT,CBZ,CBR to supported formats list"
tickets: [8124]
- title: "Don't uneccessarily scroll the book list horizontally when re-selcting previously selected rows."
new recipes:
- title: "New London Day"
author: "Being"
- title: "Walla"
author: "marbs"
- title: "New Journal of Physics"
author: "Chema Cortes"
- title: "The Baltimore Sun"
author: "Josh Hall"
- title: "Arabian Business and Sunday Times (UK)"
author: "Darko Miletic"
- title: "Deia"
author: "Gerardo Diez"
- title: "Smarter Planet"
author: "Jack Mason"
improved recipes:
- The Atlantic
- Danas
- Ledevoir
- version: 0.7.37
date: 2011-01-02
new features:
- title: "This realease is mostly a bug fix release to fix various things that got broken by all the changes in 0.7.36"
- title: "Tag browser: Move the configuration of the sub-category grouping from tweaks to the Preferences dialog"
- title: "Tag browser: Allow changing the sub-categorization scheme from the right click menu"
bug fixes:
- title: "Fix regression in 0.7.36 that caused the Tag Browser to break if you have items in it with empty sort values"
- title: "Catalog generation: Fix various regressions introduced in 0.7.36 on windows"
description: >
"Database integrity check not working after catalog generation. Catalog generation failing with a file in use error. Spurious question marks appearing in the catalog"
- title: "Catalog generation: Work on a copy of the library database so as not to lock it"
- title: "Catalog generation: Handle merge of comments + custom field when custom filed is None"
- title: "Fix regression that broke sort_columns_at_startup tweak in 0.7.36"
- title: "Tag Browser: Fix the Manage X items in the right click menu, which broke in 0.7.36"
- title: "Tag Browser: Fix grouping by name for authors"
- title: "Nook color: Fix main memory and SD card swapped in calibre"
tickets: [8159]
- title: "Fix regression in 0.7.36 that broke PDF Output when specifying a cover"
- title: "Catalog generation: Fix regression in MOBI catalog that caused it to not appear as periodical on Kindle"
- title: "Fix regression in 0.7.36 that broke opening the book details dialog by double clicking on the book details panel"
- version: 0.7.36
date: 2011-01-01
new features:
- title: "Tag browser: Add subcategories and search"
description: "When a category has many items, it will be automatically split up. Also add a search to quickly find an item in the Tag Browser. The sub categories can be controlled via preferences->Tweaks. Also add a button to collapse all categories"
type: major
- title: "Device drivers for the Google Nexus S, Motorola Backflip, Samsung Galaxy Tablet, PocketBook 603/903, EEEReader DR900 and the NextBook"
- title: "Tag editor dialog now remebers its last used size"
tickets: [8063]
- title: "OS X dmg: Add a symlink pointing to the Applications folder for easy installation"
tickets: [8052]
- title: "Catalog generation: CSV/XML catalogs now support custom fields. Also write UTF-8 BOM to CSV output file."
tickets: [8014]
- title: "EPUB/MOBI catalogs: Various new features"
description: "Added a custom field/value for excluding books, OR'd with existing tag list. Added a thumbnail width hint, from 1.0 - 2.0 inches. Deprecated support for special note tag '*', added support for custom column containing note to be inserted in Description header. Added 'Merge with comments' feature, which non-destructively combines Comments with a custom field when generating Descriptions. Moved Description header into a user-editable template file. All fields except thumb and comments accessible to template."
tickets: [7820, 5297, 6765]
- title: "SONY driver: Allow the creation of an All by Something category via the tweaks."
- title: "Add a tweak to control the delay when sending mails using gmail or hotmail."
tickets: [8064]
- title: "Add output encoding option for TXT/PDB/PMLX output plugins to the GUI"
- title: "Add an environment variable to control the temporary directory calibre uses"
- title: "Use the new HTML editor widget for comments custom columns as well"
- title: "Content server: Fix regression that broke saved searches"
tickets: [8047]
- title: "E-book viewer: Fix regression that broke previous page button"
- title: "Add a tweak to allow double clicking on the book list to open the edit metadata dialog"
tickets: [8032]
- title: "Add a tweak to use a template for formatting SONY collection names"
tickets: [8033]
- title: "Bulk edit metadata, search and replace: Show all values for multiple fields in the text region, separated by :::"
tickets: [8030]
- title: "Update user agent used by calibre when connecting to websites"
bug fixes:
- title: "FB2 Output: Fix regression that broke images in generated FB2 files"
tickets: [8142]
- title: "When unzipping zip files that contain filenames with unknown character encoding, sanitize the filenames correctly"
tickets: [8050]
- title: "TCR Output: Fix TCR compression adding junk to the end of the text. Remove compression level option."
- title: "PDF Output: Fix regression that broke the margin options."
- title: "FB2 Input: Handle non UTF-8 encodings on OS X"
tickets: [8115]
- title: "SNB Input: Better error handling if some metadata is missing in the SNB file. Add Wi-Fi connection support for the Bambook"
- title: "Allow hyperlinks to be clicked in comments metadata in the book details panel"
tickets: [8054]
improved recipes:
- Brand Eins
- Volkskrant
- Smithsonian
- Business World
- El Universal
- Salon
- The Week
- El Pais
- Wired Magazine
- Heraldo de Aragon
new recipes:
- title: "Karlsruhe News"
author: "tfeld"
- title: "El Periodico and Red Aragon"
author: "desUBIKado"
- title: "Business Insider"
author: "Darko Miletic"
- version: 0.7.35
date: 2010-12-23

View File

@@ -0,0 +1,39 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
Available fields:
{title} Title of the book
{series} Series name
{series_index} Number of the book in the series
{rating} Rating
{rating_parens} Rating, in parentheses
{pubyear} Year the book was published
{pubyear_parens} Year the book was published, in parentheses
'''
# Books by Author
by_authors_normal_title_template = '{title} {pubyear_parens}'
by_authors_series_title_template = '[{series_index}] {title} {pubyear_parens}'
# Books by Title
by_titles_normal_title_template = '{title}'
by_titles_series_title_template = '{title} ({series} [{series_index}])'
# Books by Series
by_series_title_template = '[{series_index}] {title} {pubyear_parens}'
# Books by Genre
by_genres_normal_title_template = '{title} {pubyear_parens}'
by_genres_series_title_template = '{series_index}. {title} {pubyear_parens}'
# Recently Added
by_recently_added_normal_title_template = '{title}'
by_recently_added_series_title_template = '{title} ({series} [{series_index}])'
# By Month added
by_month_added_normal_title_template = '{title} {pubyear_parens}'
by_month_added_series_title_template = '[{series_index}] {title} {pubyear_parens}'
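# A worked example of how these templates expand, assuming a hypothetical
# book titled "The Example" in series "Demo Series" with series_index 2,
# published in 1999 (so {pubyear_parens} renders as "(1999)"):
#   by_authors_series_title_template -> "[2] The Example (1999)"
#   by_titles_series_title_template  -> "The Example (Demo Series [2])"
#   by_genres_series_title_template  -> "2. The Example (1999)"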

View File

@@ -2,19 +2,29 @@ body { background-color: white; }
p.title {
margin-top:0em;
-margin-bottom:1em;
+margin-bottom:0em;
text-align:center;
font-style:italic;
font-size:xx-large;
-border-bottom: solid black 2px;
+}
+p.series_id {
+margin-top:0em;
+margin-bottom:0em;
+text-align:center;
+}
+a.series_id {
+font-style:normal;
+font-size:large;
}
p.author {
+font-size:large;
margin-top:0em;
margin-bottom:0em;
text-align: center;
text-indent: 0em;
-font-size:large;
}
p.author_index {
@@ -26,7 +36,8 @@ p.author_index {
text-indent: 0em;
}
-p.tags {
+p.genres {
+font-style:normal;
margin-top:0.5em;
margin-bottom:0em;
text-align: left;
@@ -108,6 +119,13 @@ p.date_read {
text-indent:-6em;
}
+hr.annotations_divider {
+width:50%;
+margin-left:1em;
+margin-top:0em;
+margin-bottom:0em;
+}
hr.description_divider {
width:90%;
margin-left:5%;
@@ -117,20 +135,37 @@ hr.description_divider {
border-left: solid white 0px;
}
-hr.annotations_divider {
-width:50%;
-margin-left:1em;
-margin-top:0em;
-margin-bottom:0em;
+hr.header_divider {
+width:100%;
+border-top: solid white 1px;
+border-right: solid white 0px;
+border-bottom: solid black 2px;
+border-left: solid white 0px;
+}
+hr.merged_comments_divider {
+width:80%;
+margin-left:10%;
+border-top: solid white 0px;
+border-right: solid white 0px;
+border-bottom: dashed gray 2px;
+border-left: solid white 0px;
}
td.publisher, td.date {
font-weight:bold;
text-align:center;
}
-td.rating {
-text-align: center;
+td.rating{
+text-align:center;
}
+td.notes {
+font-size: 100%;
+text-align:center;
+}
td.thumbnail img {
-webkit-box-shadow: 4px 4px 12px #999;
}

View File

@@ -0,0 +1,41 @@
<html xmlns="{xmlns}">
<head>
<title>{title_str}</title>
<meta name="catalog description header" http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<link rel="stylesheet" type="text/css" href="stylesheet.css" media="screen" />
</head>
<body>
<p class="title">{title}</p>
<p class="series_id"><a class="series_id">{series} [{series_index}]</a></p>
<hr class="header_divider" />
<p class="author">{author_prefix}<a class="author">{author}</a></p>
<p class="genres">{genres}</p>
<p class="formats">{formats}</p>
<table width="100%" border="0">
<tr>
<td class="thumbnail" rowspan="7">{thumb}</td>
<td class="empty"></td>
</tr>
<tr>
<td class="empty"></td>
</tr>
<tr>
<td class="publisher">{publisher}</td>
</tr>
<tr>
<td class="date">{pubyear}</td>
</tr>
<tr>
<td class="rating">{rating}</td>
</tr>
<tr>
<td class="notes">{note_source}: {note_content}</td>
</tr>
<tr>
<td></td>
</tr>
</table>
<hr class="description_divider" />
<div class="description">{comments}</div>
</body>
</html>
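<!-- Note: the {thumb} cell uses rowspan="7" so the cover thumbnail sits
     beside the seven stacked rows (the spacer rows plus publisher, date,
     rating and notes) rather than above them. -->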

View File

@@ -55,6 +55,27 @@ author_sort_copy_method = 'invert'
# categories_use_field_for_author_name = 'author_sort'
categories_use_field_for_author_name = 'author'
# When partitioning the tags browser, the format of the subcategory label is
# controlled by a template: categories_collapsed_name_template if sorting by
# name, categories_collapsed_rating_template if sorting by average rating, and
# categories_collapsed_popularity_template if sorting by popularity. There are
# two variables available to the template: first and last. The variable 'first'
# is the initial item in the subcategory, and the variable 'last' is the final
# item in the subcategory. Both variables are 'objects'; they each have multiple
# values that are obtained by using a suffix. For example, first.name for an
# author category will be the name of the author. The sub-values available are:
# name: the printable name of the item
# count: the number of books that reference this item
# avg_rating: the average rating of all the books referencing this item
# sort: the sort value. For authors, this is the author_sort for that author
# category: the category (e.g., authors, series) that the item is in.
# Note that the "r'" in front of the { is necessary if there are backslashes
# (\ characters) in the template. It doesn't hurt anything to leave it there
# even if there aren't any backslashes.
categories_collapsed_name_template = r'{first.sort:shorten(4,'',0)} - {last.sort:shorten(4,'',0)}'
categories_collapsed_rating_template = r'{first.avg_rating:4.2f:ifempty(0)} - {last.avg_rating:4.2f:ifempty(0)}'
categories_collapsed_popularity_template = r'{first.count:d} - {last.count:d}'
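# An illustrative example of the name template above: a subcategory of
# authors whose first item has the sort value "Adams, Douglas" and whose
# last has "Austen, Jane" would be labelled roughly "Adam - Aust", since
# shorten(4,'',0) keeps only the first four characters. (Hypothetical
# names; the actual labels depend on the items in your library.)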
# Set whether boolean custom columns are two- or three-valued.
# Two-values for true booleans
@@ -289,3 +310,11 @@ locale_for_sorting = ''
# metadata one book at a time. If True, then the fields are laid out using two
# columns. If False, one column is used.
metadata_single_use_2_cols_for_custom_fields = True
# The number of seconds to wait before sending emails when using a
# public email server like gmail or hotmail. Default is: 5 minutes (301
# seconds). Setting it lower may cause the server's SPAM controls to kick
# in, making email sending fail. Changes will take effect only after a
# restart of calibre.
public_smtp_relay_delay = 301

Binary files not shown (six images added; sizes: 8.8 KiB, 4.1 KiB, 1.3 KiB, 533 B, 3.0 KiB and 16 KiB).

View File

@@ -0,0 +1,86 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
www.arabianbusiness.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Arabian_Business(BasicNewsRecipe):
title = 'Arabian Business'
__author__ = 'Darko Miletic'
description = 'Comprehensive Guide to Middle East Business & Gulf Industry News including,Banking & Finance,Construction,Energy,Media & Marketing,Real Estate,Transportation,Travel,Technology,Politics,Healthcare,Lifestyle,Jobs & UAE guide.Top Gulf & Dubai Business News.'
publisher = 'Arabian Business Publishing Ltd.'
category = 'ArabianBusiness.com,Arab Business News,Middle East Business News,Middle East Business,Arab Media News,Industry Events,Middle East Industry News,Arab Business Industry,Dubai Business News,Financial News,UAE Business News,Middle East Press Releases,Gulf News,Arab News,GCC Business News,Banking Finance,Media Marketing,Construction,Oil Gas,Retail,Transportation,Travel Hospitality,Photos,Videos,Life Style,Fashion,United Arab Emirates,UAE,Dubai,Sharjah,Abu Dhabi,Qatar,KSA,Saudi Arabia,Bahrain,Kuwait,Oman,Europe,South Asia,America,Asia,news'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'en'
remove_empty_feeds = True
publication_type = 'newsportal'
masthead_url = 'http://www.arabianbusiness.com/skins/ab.main/gfx/arabianbusiness_logo_sm.gif'
extra_css = """
body{font-family: Georgia,serif }
img{margin-bottom: 0.4em; margin-top: 0.4em; display:block}
.byline,.dateline{font-size: small; display: inline; font-weight: bold}
ul{list-style: none outside none;}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_tags_before=dict(attrs={'id':'article-title'})
remove_tags = [
dict(name=['meta','link','base','iframe','embed','object'])
,dict(attrs={'class':'printfooter'})
]
remove_attributes=['lang']
feeds = [
(u'Africa' , u'http://www.arabianbusiness.com/world/Africa/?service=rss' )
,(u'Americas' , u'http://www.arabianbusiness.com/world/americas/?service=rss' )
,(u'Asia Pacific' , u'http://www.arabianbusiness.com/world/asia-pacific/?service=rss' )
,(u'Europe' , u'http://www.arabianbusiness.com/world/europe/?service=rss' )
,(u'Middle East' , u'http://www.arabianbusiness.com/world/middle-east/?service=rss' )
,(u'South Asia' , u'http://www.arabianbusiness.com/world/south-asia/?service=rss' )
,(u'Banking & Finance', u'http://www.arabianbusiness.com/industries/banking-finance/?service=rss' )
,(u'Construction' , u'http://www.arabianbusiness.com/industries/construction/?service=rss' )
,(u'Education' , u'http://www.arabianbusiness.com/industries/education/?service=rss' )
,(u'Energy' , u'http://www.arabianbusiness.com/industries/energy/?service=rss' )
,(u'Healthcare' , u'http://www.arabianbusiness.com/industries/healthcare/?service=rss' )
,(u'Media' , u'http://www.arabianbusiness.com/industries/media/?service=rss' )
,(u'Real Estate' , u'http://www.arabianbusiness.com/industries/real-estate/?service=rss' )
,(u'Retail' , u'http://www.arabianbusiness.com/industries/retail/?service=rss' )
,(u'Technology' , u'http://www.arabianbusiness.com/industries/technology/?service=rss' )
,(u'Transport' , u'http://www.arabianbusiness.com/industries/transport/?service=rss' )
,(u'Travel' , u'http://www.arabianbusiness.com/industries/travel-hospitality/?service=rss')
,(u'Equities' , u'http://www.arabianbusiness.com/markets/equities/?service=rss' )
,(u'Commodities' , u'http://www.arabianbusiness.com/markets/commodities/?service=rss' )
,(u'Currencies' , u'http://www.arabianbusiness.com/markets/currencies/?service=rss' )
,(u'Market Data' , u'http://www.arabianbusiness.com/markets/market-data/?service=rss' )
,(u'Comment' , u'http://www.arabianbusiness.com/opinion/comment/?service=rss' )
,(u'Think Tank' , u'http://www.arabianbusiness.com/opinion/think-tank/?service=rss' )
,(u'Arts' , u'http://www.arabianbusiness.com/lifestyle/arts/?service=rss' )
,(u'Cars' , u'http://www.arabianbusiness.com/lifestyle/cars/?service=rss' )
,(u'Food' , u'http://www.arabianbusiness.com/lifestyle/food/?service=rss' )
,(u'Sport' , u'http://www.arabianbusiness.com/lifestyle/sport/?service=rss' )
]
def print_version(self, url):
return url + '?service=printer&page='
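# Illustrative example (hypothetical URL): print_version above turns
# http://www.arabianbusiness.com/some-story-123456.html into
# http://www.arabianbusiness.com/some-story-123456.html?service=printer&page=
# i.e. the site's printer-friendly single-page rendering.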
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for alink in soup.findAll('a'):
if alink.string is not None:
tstr = alink.string
alink.replaceWith(tstr)
return soup
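# preprocess_html above strips inline style attributes and unwraps plain
# text hyperlinks (each <a> with a string child is replaced by just that
# string), so downloaded articles render cleanly without site styling.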

View File

@@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
theatlantic.com
'''
-import string, re
+import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
@@ -33,25 +33,27 @@ class TheAtlantic(BasicNewsRecipe):
articles = []
soup = self.index_to_soup(self.INDEX)
-sectit = soup.find('h1', attrs={'class':'sectionTitle'})
-if sectit is not None:
-texts = self.tag_to_string(sectit).strip().split()[-2:]
-if texts:
-self.timefmt = ' [%s]'%(' '.join(texts))
+ts = soup.find(id='magazineTopStories')
+ds = self.tag_to_string(ts.find('h1')).split(':')[-1]
+self.timefmt = ' [%s]'%ds
cover = soup.find('img', src=True, attrs={'class':'cover'})
if cover is not None:
self.cover_url = cover['src']
feeds = []
+seen_titles = set([])
for section in soup.findAll('div', attrs={'class':'magazineSection'}):
-section_title = section.find(attrs={'class':'sectionHeader'})
-section_title = string.capwords(self.tag_to_string(section_title))
+section_title = self.tag_to_string(section.find('h2'))
self.log('Found section:', section_title)
articles = []
-for post in section.findAll('div', attrs={'class':'post'}):
+for post in section.findAll('div', attrs={'class':lambda x : x and
+'post' in x}):
h = post.find(['h3', 'h4'])
title = self.tag_to_string(h)
+if title in seen_titles:
+continue
+seen_titles.add(title)
a = post.find('a', href=True)
url = a['href']
if url.startswith('/'):
@@ -64,36 +66,23 @@ class TheAtlantic(BasicNewsRecipe):
self.log('\t\t', desc)
articles.append({'title':title, 'url':url, 'description':desc,
'date':''})
-feeds.append((section_title, articles))
+if articles:
+feeds.append((section_title, articles))
poems = []
self.log('Found section: Poems')
-for poem in soup.findAll('div', attrs={'class':'poem'}):
-title = self.tag_to_string(poem.find('h4'))
-desc = self.tag_to_string(poem.find(attrs={'class':'author'}))
+pd = soup.find('h2', text='Poetry').parent.parent
+for poem in pd.findAll('h4'):
+title = self.tag_to_string(poem)
url = poem.find('a')['href']
if url.startswith('/'):
url = 'http://www.theatlantic.com' + url
self.log('\tFound article:', title, 'at', url)
-self.log('\t\t', desc)
-poems.append({'title':title, 'url':url, 'description':desc,
+poems.append({'title':title, 'url':url, 'description':'',
'date':''})
if poems:
feeds.append(('Poems', poems))
-div = soup.find(id='advice')
-if div is not None:
-self.log('Found section: Advice')
-title = self.tag_to_string(div.find('h4'))
-url = div.find('a')['href']
-if url.startswith('/'):
-url = 'http://www.theatlantic.com' + url
-desc = self.tag_to_string(div.find('p'))
-self.log('\tFound article:', title, 'at', url)
-self.log('\t\t', desc)
-feeds.append(('Advice', [{'title':title, 'url':url, 'description':desc,
-'date':''}]))
return feeds
def postprocess_html(self, soup, first):

View File

@@ -0,0 +1,186 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = 'Original 2009, Kovid Goyal <kovid@kovidgoyal.net>'
__copyright__= 'Modified 2011, Josh Hall <jwtheiv@gmail.com>'
__docformat__ = 'restructuredtext en'
'''
www.baltimoresun.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class BaltimoreSun(BasicNewsRecipe):
title = 'The Baltimore Sun'
__author__ = 'Josh Hall'
description = 'Politics, local and business news from Baltimore'
language = 'en'
oldest_article = 1
max_articles_per_feed = 100
remove_empty_feeds = True
use_embedded_content = False
no_stylesheets = True
remove_javascript = True
#masthead_url = 'http://www.baltimoresun.com/images/thirdpartylogo.gif'
remove_tags_before = dict(name='div', attrs={'class':['story', 'entry']})
remove_tags_after = [
{'class':['photo_article',]},
dict(name='div', attrs={'class':'shirttail-promo right clearfix'}),
]
keep_only_tags = [dict(name='div', attrs={'class':["story","entry-asset asset hentry"]}),
dict(name='div', attrs={'id':["pagebody","story","maincontentcontainer"]}),
]
remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer","article-promo"]},
{'class':["entry-footer-left","entry-footer-right","shirttail-promo right clearfix","clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent","toppaginate","module","module-header","module-content"]},
dict(name='font',attrs={'id':["cr-other-headlines"]}),
dict(name=['iframe']),
]
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
.byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
.date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
.copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center}
.story{font-family:Arial,Helvetica,sans-serif;font-size:small;}
.entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;}
.pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;}
.maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;}
.story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''
feeds = [
(u'Top Headlines', u'http://www.baltimoresun.com/rss2.0.xml'),
(u'Breaking News', u'http://www.baltimoresun.com/news/breaking/rss2.0.xml'),
(u'Top Maryland', u'http://www.baltimoresun.com/news/maryland/rss2.0.xml'),
#(u'Anne Arundel County', u'http://www.baltimoresun.com/news/maryland/anne-arundel/rss2.0.xml'),
(u'Baltimore City', u'http://www.baltimoresun.com/news/maryland/baltimore-city/rss2.0.xml'),
#(u'Baltimore County', u'http://www.baltimoresun.com/news/maryland/baltimore-county/rss2.0.xml'),
#(u'Carroll County', u'http://www.baltimoresun.com/news/maryland/carroll/rss2.0.xml'),
#(u'Harford County', u'http://www.baltimoresun.com/news/maryland/harford/rss2.0.xml'),
#(u'Howard County', u'http://www.baltimoresun.com/news/maryland/howard/rss2.0.xml'),
(u'Education', u'http://www.baltimoresun.com/news/education/rss2.0.xml'),
#(u'Obituaries', u'http://www.baltimoresun.com/news/obituaries/rss2.0.xml'),
(u'Local Politics', u'http://www.baltimoresun.com/news/maryland/politics/rss2.0.xml'),
(u'Weather', u'http://www.baltimoresun.com/news/weather/rss2.0.xml'),
#(u'Traffic', u'http://www.baltimoresun.com/features/commuting/rss2.0.xml'),
(u'Nation/world', u'http://feeds.chicagotribune.com/chicagotribune/news/nationworld/'),
(u'Weird News', u'http://www.baltimoresun.com/news/offbeat/rss2.0.xml'),
(u'Top Sports', u'http://www.baltimoresun.com/sports/rss2.0.xml'),
(u'Orioles/Baseball', u'http://www.baltimoresun.com/sports/orioles/rss2.0.xml'),
(u'Ravens/Football', u'http://www.baltimoresun.com/sports/ravens/rss2.0.xml'),
#(u'Terps', u'http://www.baltimoresun.com/sports/terps/rss2.0.xml'),
#(u'College Football', u'http://www.baltimoresun.com/sports/college/football/rss2.0.xml'),
#(u'Lacrosse', u'http://www.baltimoresun.com/sports/college/lacrosse/rss2.0.xml'),
#(u'Horse Racing', u'http://www.baltimoresun.com/sports/horse-racing/rss2.0.xml'),
#(u'Golf', u'http://www.baltimoresun.com/sports/golf/rss2.0.xml'),
#(u'NBA', u'http://www.baltimoresun.com/sports/nba/rss2.0.xml'),
#(u'High School', u'http://www.baltimoresun.com/sports/high-school/rss2.0.xml'),
#(u'Outdoors', u'http://www.baltimoresun.com/sports/outdoors/rss2.0.xml'),
(u'Celebrity News', u'http://www.baltimoresun.com/entertainment/celebrities/rss2.0.xml'),
(u'Arts & Theater', u'http://www.baltimoresun.com/entertainment/arts/rss2.0.xml'),
(u'Movies', u'http://www.baltimoresun.com/entertainment/movies/rss2.0.xml'),
(u'Music & Nightlife', u'http://www.baltimoresun.com/entertainment/music/rss2.0.xml'),
(u'Restaurants & Food', u'http://www.baltimoresun.com/entertainment/dining/rss2.0.xml'),
(u'TV/Media', u'http://www.baltimoresun.com/entertainment/tv/rss2.0.xml'),
(u'Health&Wellness', u'http://www.baltimoresun.com/health/rss2.0.xml'),
(u'Home & Garden', u'http://www.baltimoresun.com/features/home-garden/rss2.0.xml'),
(u'Living Green', u'http://www.baltimoresun.com/features/green/rss2.0.xml'),
(u'Parenting', u'http://www.baltimoresun.com/features/parenting/rss2.0.xml'),
(u'Fashion', u'http://www.baltimoresun.com/features/fashion/rss2.0.xml'),
(u'Travel', u'http://www.baltimoresun.com/travel/rss2.0.xml'),
(u'Faith', u'http://www.baltimoresun.com/features/faith/rss2.0.xml'),
(u'Top Business', u'http://www.baltimoresun.com/business/rss2.0.xml'),
(u'Technology', u'http://www.baltimoresun.com/business/technology/rss2.0.xml'),
(u'Personal finance', u'http://www.baltimoresun.com/business/money/rss2.0.xml'),
(u'Real Estate', u'http://www.baltimoresun.com/classified/realestate/rss2.0.xml'),
(u'Jobs', u'http://www.baltimoresun.com/classified/jobs/rss2.0.xml'),
(u'DIY', u'http://www.baltimoresun.com/features/do-it-yourself/rss2.0.xml'),
(u'Consumer Safety', u'http://www.baltimoresun.com/business/consumer-safety/rss2.0.xml'),
(u'Investing', u'http://www.baltimoresun.com/business/money/rss2.0.xml'),
(u'Sun Editorials', u'http://www.baltimoresun.com/news/opinion/editorial/rss2.0.xml'),
(u'Op/Ed', u'http://www.baltimoresun.com/news/opinion/oped/rss2.0.xml'),
(u'Readers Respond', u'http://www.baltimoresun.com/news/opinion/readersrespond/'),
(u'Kevin Cowherd', 'http://www.baltimoresun.com/sports/bal-columnist-cowherd,0,6829726.columnist-rss2.0.xml'),
(u'Jay Hancock', u'http://www.baltimoresun.com/business/money/bal-columnist-hancock,0,6673611.columnist-rss2.0.xml'),
(u'Jacques Kelly', u'http://www.baltimoresun.com/news/maryland/bal-columnist-kelly,0,1154701.columnist-rss2.0.xml'),
(u'Marta H. Mossburg', u'http://www.baltimoresun.com/news/opinion/oped/bal-columnist-mossburg,0,7982155.columnist-rss2.0.xml'),
(u'Mike Preston', u'http://www.baltimoresun.com/sports/bal-columnist-preston,0,6169796.columnist-rss2.0.xml'),
(u'Susan Reimer', u'http://www.baltimoresun.com/news/opinion/bal-columnist-reimer,0,162466.columnist-rss2.0.xml'),
(u'Dan Rodricks', u'http://www.baltimoresun.com/news/maryland/bal-columnist-rodricks,0,7089843.columnist-rss2.0.xml'),
(u'Thomas F. Schaller', u'http://www.baltimoresun.com/news/opinion/columnists/bal-columnist-schaller,0,897397.columnist-rss2.0.xml'),
(u'Peter Schmuck', u'http://www.baltimoresun.com/sports/bal-columnist-schmuck,0,7485088.columnist-rss2.0.xml'),
(u'Ron Smith', u'http://www.baltimoresun.com/news/opinion/bal-columnist-ronsmith,0,3964803.columnist-rss2.0.xml'),
(u'Baltimore Crime Beat', u'http://weblogs.baltimoresun.com/news/crime/blog/index.xml'),
(u'Getting There', u'http://weblogs.baltimoresun.com/news/traffic/index.xml'),
(u'InsideEd', u'http://weblogs.baltimoresun.com/news/education/blog/index.xml'),
(u'Maryland Politics', u'http://weblogs.baltimoresun.com/news/local/politics/index.xml'),
(u'Maryland Weather', u'http://weblogs.marylandweather.com/index.xml'),
(u'Second Opinion', u'http://weblogs.baltimoresun.com/news/opinion/index.xml'),
(u'You Dont Say', u'http://weblogs.baltimoresun.com/news/mcintyre/blog/index.xml'),
(u'BaltTech', u'http://weblogs.baltimoresun.com/news/technology/index.xml'),
(u'Consuming Interests', u'http://weblogs.baltimoresun.com/business/consuminginterests/blog/index.xml'),
(u'Jay Hancocks Blog', u'http://weblogs.baltimoresun.com/business/hancock/blog/index.xml'),
(u'The Real Estate Wonk', u'http://weblogs.baltimoresun.com/business/realestate/blog/index.xml'),
(u'Clef Notes', 'http://weblogs.baltimoresun.com/entertainment/classicalmusic/index.xml'),
(u'Dining at Large', u'http://weblogs.baltimoresun.com/entertainment/dining/reviews/blog/index.xml'),
(u'Midnight Sun', u'http://weblogs.baltimoresun.com/entertainment/midnight_sun/blog/index.xml'),
(u'Mike Sragow Gets Reel', u'http://weblogs.baltimoresun.com/entertainment/movies/blog/index.xml'),
(u'Read Street', u'http://weblogs.baltimoresun.com/entertainment/books/blog/index.xml'),
(u'Reality Check', u'http://weblogs.baltimoresun.com/entertainment/realitycheck/blog/index.xml'),
(u'Z on TV', u'http://weblogs.baltimoresun.com/entertainment/zontv/index.xml'),
(u'BMore Green', u'http://weblogs.baltimoresun.com/features/green/index.xml'),
(u'Charm City Moms', u'http://weblogs.baltimoresun.com/features/baltimoremomblog/index.xml'),
(u'Exercists', u'http://weblogs.baltimoresun.com/health/fitness/index.xml'),
(u'Garden Variety', 'http://weblogs.baltimoresun.com/features/gardening/index.xml'),
#(u'In Good Faith', u'http://weblogs.baltimoresun.com/news/faith/index.xml'),
(u'Picture of Health', u'http://weblogs.baltimoresun.com/health/index.xml'),
(u'Unleashed', u'http://weblogs.baltimoresun.com/features/mutts/blog/index.xml'),
#(u'Faceoff', u'http://weblogs.baltimoresun.com/sports/lacrosse/blog/index.xml'),
#(u'MMA Stomping Grounds', u'http://weblogs.baltimoresun.com/sports/mma/blog/index.xml'),
(u'Orioles Insider', u'http://weblogs.baltimoresun.com/sports/orioles/blog/index.xml'),
#(u'Outdoors Girl', u'http://weblogs.baltimoresun.com/sports/outdoors/blog/index.xml'),
(u'Ravens Insider', u'http://weblogs.baltimoresun.com/sports/ravens/blog/index.xml'),
#(u'Recruiting Report', u'http://weblogs.baltimoresun.com/sports/college/recruiting/index.xml'),
#(u'Ring Posts', u'http://weblogs.baltimoresun.com/sports/wrestling/blog/index.xml'),
(u'The Schmuck Stops Here', u'http://weblogs.baltimoresun.com/sports/schmuck/index.xml'),
(u'Toy Department', u'http://weblogs.baltimoresun.com/sports/thetoydepartment/index.xml'),
#(u'Tracking the Terps', u'http://weblogs.baltimoresun.com/sports/college/maryland_terps/blog/index.xml'),
#(u'Varsity Letters', u'http://weblogs.baltimoresun.com/sports/highschool/varsityletters/index.xml'),
(u'Virtual Vensanity', u'http://weblogs.baltimoresun.com/entertainment/bthesite/vensel/index.xml'),
]
def get_article_url(self, article):
print article.get('feedburner_origlink', article.get('guid', article.get('link')))
return article.get('feedburner_origlink', article.get('guid', article.get('link')))
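# The lookup above prefers the FeedBurner original link, then the guid,
# then the plain link, so feeds proxied through FeedBurner resolve to the
# real article URL; the print statement just logs which one was chosen.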
def postprocess_html(self, soup, first_fetch):
for t in soup.findAll(['table', 'tr', 'td']):
t.name = 'div'
for tag in soup.findAll('form', dict(attrs={'name':["comments_form"]})):
tag.extract()
for tag in soup.findAll('font', dict(attrs={'id':["cr-other-headlines"]})):
tag.extract()
return soup

View File

@@ -1,19 +1,16 @@
#!/usr/bin/env python
# -*- coding: utf-8 mode: python -*-
-# Find the newest version of this recipe here:
-# https://github.com/consti/BrandEins-Recipe/raw/master/brandeins.recipe
__license__ = 'GPL v3'
__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>, Steffen Siebert <calibre at steffensiebert.de>'
-__version__ = '0.96'
+__version__ = '0.97'
''' http://brandeins.de - Wirtschaftsmagazin '''
import re
import string
+from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.recipes import BasicNewsRecipe
class BrandEins(BasicNewsRecipe):
title = u'brand eins'
@@ -28,6 +25,8 @@ class BrandEins(BasicNewsRecipe):
language = 'de'
publication_type = 'magazine'
needs_subscription = 'optional'
+# Prevent the conversion date being appended to the title
+timefmt = ''
# 2 is the last full magazine (default)
# 1 is the newest (but not full)
@@ -66,6 +65,13 @@ class BrandEins(BasicNewsRecipe):
new_p = "<p><i>"+ content +"</i></p>"
p.replaceWith(new_p)
+# Change <h3> to <h1>
+header = soup.find("h3")
+if header:
+tag = Tag(soup, "h1")
+tag.insert(0, header.contents[0])
+header.replaceWith(tag)
return soup
def get_cover(self, soup):
@@ -77,6 +83,7 @@ class BrandEins(BasicNewsRecipe):
def parse_index(self):
feeds = []
+issue_map = {}
archive = "http://www.brandeins.de/archiv.html"
@@ -88,21 +95,31 @@ class BrandEins(BasicNewsRecipe):
pass
soup = self.index_to_soup(archive)
-latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b') })[0].findAll('ul')[0]
-pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-issue]
-url = pre_latest_issue.get('href', False)
-# Get month and year of the magazine issue - build it out of the title of the cover
-self.timefmt = " " + re.search(r"(?P<date>\d\d\/\d\d\d\d)", pre_latest_issue.find('img').get('title', False)).group('date')
+issue_list = soup.findAll('div', attrs={'class': 'tx-brandeinsmagazine-pi1'})[0].findAll('a')
+issue_list = [i for i in issue_list if i.get('onmouseover', False)]
+for i in issue_list:
+issue_number_string = i.get('onmouseover', False)
+if issue_number_string:
+match = re.match("^switch_magazine\(([0-9]+), ([0-9]+)\)$", issue_number_string)
+issue_number = "%04i%02i" % (int(match.group(1)), int(match.group(2)))
+issue_map[issue_number] = i
+keys = issue_map.keys()
+keys.sort()
+keys.reverse()
+selected_issue = issue_map[keys[issue-1]]
+url = selected_issue.get('href', False)
+# Get the title for the magazine - build it out of the title of the cover - take the issue and year
+self.title = "brand eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d)", selected_issue.find('img').get('title', False)).group('date')
url = 'http://brandeins.de/'+url
# url = "http://www.brandeins.de/archiv/magazin/tierisch.html"
-titles_and_articles = self.brand_eins_parse_latest_issue(url)
+titles_and_articles = self.brand_eins_parse_issue(url)
if titles_and_articles:
for title, articles in titles_and_articles:
feeds.append((title, articles))
return feeds
-def brand_eins_parse_latest_issue(self, url):
+def brand_eins_parse_issue(self, url):
soup = self.index_to_soup(url)
self.cover_url = self.get_cover(soup)
article_lists = [soup.find('div', attrs={'class':'subColumnLeft articleList'}), soup.find('div', attrs={'class':'subColumnRight articleList'})]
@@ -145,4 +162,3 @@ class BrandEins(BasicNewsRecipe):
current_articles.append({'title': title, 'url': url, 'description': description, 'date':''})
titles_and_articles.append([chapter_title, current_articles])
return titles_and_articles

View File

@@ -1,7 +1,5 @@
-#!/usr/bin/env python
__license__ = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.businessworld.in
'''
@@ -22,7 +20,11 @@ class BusinessWorldMagazine(BasicNewsRecipe):
use_embedded_content = False
encoding = 'utf-8'
language = 'en_IN'
+extra_css = """
+img{display: block; margin-bottom: 0.5em}
+body{font-family: Arial,Helvetica,sans-serif}
+h2{color: gray; display: block}
+"""
conversion_options = {
'comment' : description
@@ -43,6 +45,25 @@ class BusinessWorldMagazine(BasicNewsRecipe):
linklist = []
soup = self.index_to_soup(self.INDEX)
+tough = soup.find('div', attrs={'id':'tough'})
+if tough:
+for item in tough.findAll('h1'):
+description = ''
+title_prefix = ''
+feed_link = item.find('a')
+if feed_link and feed_link.has_key('href'):
+url = self.ROOT + feed_link['href']
+if not self.is_in_list(linklist,url):
+title = title_prefix + self.tag_to_string(feed_link)
+date = strftime(self.timefmt)
+articles.append({
+'title' :title
+,'date' :date
+,'url' :url
+,'description':description
+})
+linklist.append(url)
for item in soup.findAll('div', attrs={'class':'nametitle'}):
description = ''
title_prefix = ''
@@ -62,8 +83,8 @@ class BusinessWorldMagazine(BasicNewsRecipe):
return [(soup.head.title.string, articles)]
-keep_only_tags = [dict(name='div', attrs={'id':['register-panel','printwrapper']})]
+keep_only_tags = [dict(name='div', attrs={'id':'printwrapper'})]
-remove_tags = [dict(name=['object','link'])]
+remove_tags = [dict(name=['object','link','meta','base','iframe','link','table'])]
def print_version(self, url):
return url.replace('/bw/','/bw/storyContent/')
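# Illustrative example (hypothetical path): print_version above rewrites
# http://www.businessworld.in/bw/some-story to
# http://www.businessworld.in/bw/storyContent/some-story, the print view.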

View File

@@ -0,0 +1,35 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Cicero(BasicNewsRecipe):
timefmt = ' [%Y-%m-%d]'
title = u'Cicero'
__author__ = 'mad@sharktooth.de'
description = u'Magazin f\xfcr politische Kultur'
oldest_article = 7
language = 'de'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
publisher = 'Ringier Publishing'
category = 'news, politics, Germany'
encoding = 'iso-8859-1'
publication_type = 'magazine'
masthead_url = 'http://www.cicero.de/img2/cicero_logo_rss.gif'
feeds = [
(u'Das gesamte Portfolio', u'http://www.cicero.de/rss/rss.php?ress_id='),
#(u'Alle Heft-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=heft'),
#(u'Alle Online-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=online'),
#(u'Berliner Republik', u'http://www.cicero.de/rss/rss.php?ress_id=4'),
#(u'Weltb\xfchne', u'http://www.cicero.de/rss/rss.php?ress_id=1'),
#(u'Salon', u'http://www.cicero.de/rss/rss.php?ress_id=7'),
#(u'Kapital', u'http://www.cicero.de/rss/rss.php?ress_id=6'),
#(u'Netzst\xfccke', u'http://www.cicero.de/rss/rss.php?ress_id=9'),
#(u'Leinwand', u'http://www.cicero.de/rss/rss.php?ress_id=12'),
#(u'Bibliothek', u'http://www.cicero.de/rss/rss.php?ress_id=15'),
(u'Kolumne - Alle Kolumnen', u'http://www.cicero.de/rss/rss2.php?ress_id='),
#(u'Kolumne - Schreiber, Berlin', u'http://www.cicero.de/rss/rss2.php?ress_id=35'),
#(u'Kolumne - TV Kritik', u'http://www.cicero.de/rss/rss2.php?ress_id=34')
]
def print_version(self, url):
return 'http://www.cicero.de/page_print.php?' + url.rpartition('?')[2]
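# Illustrative example (hypothetical URL): url.rpartition('?')[2] keeps
# everything after the last '?', so an article at
# http://www.cicero.de/artikel.php?item=1234 becomes
# http://www.cicero.de/page_print.php?item=1234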

View File

@@ -11,7 +11,7 @@ class CNetJapan(BasicNewsRecipe):
(u'CNet Blog', u'http://feed.japan.cnet.com/rss/blog/index.rdf')
]
language = 'ja'
-encoding = 'Shift_JIS'
+encoding = 'utf-8'
remove_javascript = True
preprocess_regexps = [

View File

@@ -1,5 +1,5 @@
__license__ = 'GPL v3'
-__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
danas.rs
'''
@@ -33,6 +33,7 @@ class Danas(BasicNewsRecipe):
margin-bottom: 0;
margin-top: 0}
h2,.datum,.lokacija,.autor{font-size: small}
+.autor{text-transform: uppercase}
.antrfileNaslov{border-left: 2px solid #999999;
margin-left: 0.8em;
padding-left: 1.2em;
@@ -40,6 +41,7 @@ class Danas(BasicNewsRecipe):
margin-bottom: 0;
margin-top: 0}
img{margin-bottom: 0.8em}
+.naslovTemeDana{font-size: small}
"""
conversion_options = {
@@ -62,6 +64,7 @@ class Danas(BasicNewsRecipe):
,(re.compile(u'\u00f4'), lambda match: '&ldquo;') # latin small letter o with circumflex
,(re.compile(u'\u00f6'), lambda match: '&rdquo;') # latin small letter o with diaeresis
,(re.compile(u'\u00e1'), lambda match: '&nbsp;' ) # latin small letter a with acute
+,(re.compile(u'\u00e4'), lambda match: '&nbsp;' ) # latin small letter a with diaeresis
]
keep_only_tags = [dict(name='div', attrs={'id':'left'})]
@@ -124,6 +127,6 @@ class Danas(BasicNewsRecipe):
cover_url = None
soup = self.index_to_soup('http://www.danas.rs/')
for citem in soup.findAll('img'):
-if citem['src'].endswith('naslovna.jpg'):
+if citem['src'].endswith('naslovna.jpg') or citem['src'].endswith('naslovna1.jpg'):
return 'http://www.danas.rs' + citem['src']
return cover_url

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Gerardo Diez'
__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
__docformat__ = 'restructuredtext en'
'''
deia.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class Deia(BasicNewsRecipe):
title ='Deia'
__author__ ='Gerardo Diez'
publisher ='Editorial Iparraguirre, S.A'
category ='news, politics, finances, world, spain, euskadi'
publication_type ='newspaper'
oldest_article =1
max_articles_per_feed =100
simultaneous_downloads =10
cover_url ='http://2.bp.blogspot.com/_RjrWzC6tI14/TM6jrPLaBZI/AAAAAAAAFaI/ayffwxidFEY/s1600/2009-10-13-logo-deia.jpg'
timefmt ='[%a, %d %b, %Y]'
encoding ='utf8'
language ='es'
remove_javascript =True
remove_tags_after =dict(id='Texto')
remove_tags_before =dict(id='Texto')
remove_tags =[dict(name='div', attrs={'class':['Herramientas ', 'Multimedia']})]
no_stylesheets =True
extra_css ='h1 {margin-bottom: .15em;font-size: 2.7em; font-family: Georgia, "Times New Roman", Times, serif;} .Antetitulo {margin: 1em 0;text-transform: uppercase;color: #999;} .PieFoto {margin: .1em 0;padding: .5em .5em .5em .5em;background: #F0F0F0;} .PieFoto p {margin-bottom: 0;font-family: Georgia,"Times New Roman",Times,serif;font-weight: bold; font-style: italic; color: #666;}'
keep_only_tags =[dict(name='div', attrs={'class':['Texto ', 'NoticiaFicha ']})]
feeds = [
(u'Bizkaia' ,u'http://www.deia.com/index.php/services/rss?seccion=bizkaia'),
(u'Bilbao' ,u'http://www.deia.com/index.php/services/rss?seccion=bilbao'),
(u'Hemendik eta Handik' ,u'http://www.deia.com/index.php/services/rss?seccion=hemendik-eta-handik'),
(u'Margen Derecha' ,u'http://www.deia.com/index.php/services/rss?seccion=margen-derecha'),
(u'Encartaciones y Margen Izquierda' ,u'http://www.deia.com/index.php/services/rss?seccion=margen-izquierda-encartaciones'),
(u'Costa' ,u'http://www.deia.com/index.php/services/rss?seccion=costa'),
(u'Duranguesado' ,u'http://www.deia.com/index.php/services/rss?seccion=duranguesado'),
(u'Llodio-Nervión' ,u'http://www.deia.com/index.php/services/rss?seccion=llodio-nervion'),
(u'Arratia-Nervión' ,u'http://www.deia.com/index.php/services/rss?seccion=arratia-nervion'),
(u'Uribe-Txorierri' ,u'http://www.deia.com/index.php/services/rss?seccion=uribe-txorierri'),
(u'Ecos de sociedad' ,u'http://www.deia.com/index.php/services/rss?seccion=ecos-de-sociedad'),
(u'Sucesos' ,u'http://www.deia.com/index.php/services/rss?seccion=sucesos'),
(u'Política' ,u'http://www.deia.com/index.php/services/rss?seccion=politica'),
(u'Euskadi' ,u'http://www.deia.com/index.php/services/rss?seccion=politica/euskadi'),
(u'España' ,u'http://www.deia.com/index.php/services/rss?seccion=politica/espana'),
(u'Sociedad',u'http://www.deia.com/index.php/services/rss?seccion=sociedad'),
(u'Euskadi' ,u'http://www.deia.com/index.php/services/rss?seccion=socidad/euskadi'),
(u'Sociedad.España' ,u'http://www.deia.com/index.php/services/rss?seccion=sociedad/espana'),
(u'Ocio y Cultura' ,u'http://www.deia.com/index.php/services/rss?seccion=ocio-y-cultura'),
#(u'Cultura' ,u'http://www.deia.com/index.php/services/rss?seccion=cultura'),
#(u'Ocio' ,u'http://www.deia.com/index.php/services/rss?seccion=ocio'),
(u'On' ,u'http://www.deia.com/index.php/services/rss?seccion=on'),
(u'Agenda' ,u'http://www.deia.com/index.php/services/rss?seccion=agenda'),
(u'Comunicación' ,u'http://www.deia.com/index.php/services/rss?seccion=comunicacion'),
(u'Viajes' ,u'http://www.deia.com/index.php/services/rss?seccion=viajes'),
(u'¡Mundo!' ,u'http://www.deia.com/index.php/services/rss?seccion=que-mundo'),
(u'Humor' ,u'http://www.deia.com/index.php/services/rss?seccion=humor'),
(u'Opinión' ,u'http://www.deia.com/index.php/services/rss?seccion=opinion'),
(u'Editorial' ,u'http://www.deia.com/index.php/services/rss?seccion=editorial'),
(u'Tribuna abierta' ,u'http://www.deia.com/index.php/services/rss?seccion=tribuna-abierta'),
(u'Colaboración' ,u'http://www.deia.com/index.php/services/rss?seccion=colaboracion'),
(u'Columnistas' ,u'http://www.deia.com/index.php/services/rss?seccion=columnistas'),
(u'Deportes' ,u'http://www.deia.com/index.php/services/rss?seccion=deportes'),
(u'Athletic' ,u'http://www.deia.com/index.php/services/rss?seccion=athletic'),
(u'Economía' ,'http://www.deia.com/index.php/services/rss?seccion=economia'),
(u'Mundo' ,u'http://www.deia.com/index.php/services/rss?seccion=mundo')]

View File

@@ -9,7 +9,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
-import mechanize, string, urllib, time, re
+import string, time, re
class Economist(BasicNewsRecipe):
@@ -18,19 +18,19 @@ class Economist(BasicNewsRecipe):
__author__ = "Kovid Goyal"
INDEX = 'http://www.economist.com/printedition'
-description = ('Global news and current affairs from a European perspective.'
-' Needs a subscription from ')+INDEX
+description = 'Global news and current affairs from a European perspective.'
oldest_article = 7.0
cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
remove_tags = [dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
dict(attrs={'class':['dblClkTrk', 'ec-article-info']})]
keep_only_tags = [dict(id='ec-article-body')]
-needs_subscription = True
+needs_subscription = False
no_stylesheets = True
preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
lambda x:'</html>')]
+'''
def get_browser(self):
br = BasicNewsRecipe.get_browser()
br.open('http://www.economist.com')
@@ -50,6 +50,7 @@ class Economist(BasicNewsRecipe):
}))
br.open(req).read()
return br
+'''
def parse_index(self):
try:

View File

@ -7,12 +7,12 @@ from lxml import html

 class Economist(BasicNewsRecipe):

-    title = 'The Economist (free)'
+    title = 'The Economist (RSS)'
     language = 'en'

     __author__ = "Kovid Goyal"
     description = ('Global news and current affairs from a European perspective.'
-                   ' Much slower than the subscription based version.')
+                   ' Much slower than the print edition based version.')

     oldest_article = 7.0
     cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'

View File

@ -0,0 +1,122 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '08 January 2011, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Biscay'
__version__ = 'v0.08'
__date__ = '08, January 2011'
'''
http://www.elcorreo.com/
'''
import time
import re
from calibre.web.feeds.news import BasicNewsRecipe
class heraldo(BasicNewsRecipe):
__author__ = 'desUBIKado'
description = 'Daily newspaper from Biscay'
title = u'El Correo'
publisher = 'Vocento'
category = 'News, politics, culture, economy, general interest'
oldest_article = 2
delay = 1
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
language = 'es'
timefmt = '[%a, %d %b, %Y]'
encoding = 'iso-8859-1'
remove_empty_feeds = True
remove_javascript = False
feeds = [
(u'Portada', u'http://www.elcorreo.com/vizcaya/portada.xml'),
(u'Local', u'http://www.elcorreo.com/vizcaya/rss/feeds/vizcaya.xml'),
(u'Internacional', u'http://www.elcorreo.com/vizcaya/rss/feeds/internacional.xml'),
(u'Econom\xeda', u'http://www.elcorreo.com/vizcaya/rss/feeds/economia.xml'),
(u'Pol\xedtica', u'http://www.elcorreo.com/vizcaya/rss/feeds/politica.xml'),
(u'Opini\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/opinion.xml'),
(u'Deportes', u'http://www.elcorreo.com/vizcaya/rss/feeds/deportes.xml'),
(u'Sociedad', u'http://www.elcorreo.com/vizcaya/rss/feeds/sociedad.xml'),
(u'Cultura', u'http://www.elcorreo.com/vizcaya/rss/feeds/cultura.xml'),
(u'Televisi\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/television.xml'),
(u'Gente', u'http://www.elcorreo.com/vizcaya/rss/feeds/gente.xml')
]
keep_only_tags = [
dict(name='div', attrs={'class':['grouphead','date','art_head','story-texto','text','colC_articulo','contenido_comentarios']}),
dict(name='div' , attrs={'id':['articulo','story-texto','story-entradilla']})
]
remove_tags = [
dict(name='div', attrs={'class':['art_barra','detalles-opinion','formdenunciar','modulo calculadoras','nubetags','pie']}),
dict(name='div', attrs={'class':['mod_lomas','bloque_lomas','blm_header','link-app3','link-app4','botones_listado']}),
dict(name='div', attrs={'class':['navegacion_galeria','modulocanalpromocion','separa','separacion','compartir','tags_relacionados']}),
dict(name='div', attrs={'class':['moduloBuscadorDeportes','modulo-gente','moddestacadopeq','OpcArt','articulopiniones']}),
dict(name='div', attrs={'class':['modulo-especial','publiEspecial']}),
dict(name='div', attrs={'id':['articulopina']}),
dict(name='br', attrs={'class':'clear'}),
dict(name='form', attrs={'name':'frm_conversor2'})
]
remove_tags_before = dict(name='div' , attrs={'class':'articulo '})
remove_tags_after = dict(name='div' , attrs={'class':'comentarios'})
def get_cover_url(self):
cover = None
st = time.localtime()
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
#http://img.kiosko.net/2011/01/02/es/elcorreo.750.jpg
#http://info.elcorreo.com/pdf/06012011-viz.pdf
cover='http://info.elcorreo.com/pdf/'+ day + month + year +'-viz.pdf'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
self.log("\nPortada no disponible")
cover ='http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
return cover
extra_css = '''
h1, .headline {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
h2, .subhead {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:18px;}
h3, .overhead {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
h4 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
h5 {font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
h6 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
.date,.byline, .photo {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
img{margin-bottom: 0.4em}
'''
preprocess_regexps = [
# To present the image of the embedded video
(re.compile(r'var RUTA_IMAGEN', re.DOTALL|re.IGNORECASE), lambda match: '</script><img src'),
(re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
(re.compile(r'var SITIO = "elcorreo";', re.DOTALL|re.IGNORECASE), lambda match: '<SCRIPT TYPE="text/JavaScript"'),
# To separate paragraphs with a blank line
(re.compile(r'<div class="p"', re.DOTALL|re.IGNORECASE), lambda match: '<p></p><div class="p"'),
# To put a blank line between the subtitle and the date and time of the news
(re.compile(r'<div class="date">', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="date">'),
# To put a blank line between the intro of the embedded videos and the previous text
(re.compile(r'<div class="video"', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="video"'),
# To view photos from the first when these are presented as a gallery
(re.compile(r'src="/img/shim.gif"', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'rel=', re.DOTALL|re.IGNORECASE), lambda match: 'src='),
# To remove the link of the title
(re.compile(r'<h1 class="headline">\n<a href="', re.DOTALL|re.IGNORECASE), lambda match: '<h1 class="'),
(re.compile(r'</a>\n</h1>', re.DOTALL|re.IGNORECASE), lambda match: '</h1>'),
]

View File

@ -0,0 +1,43 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Gerardo Diez'
__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
__docformat__ = 'restructuredtext en'
'''
publico.es
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class Publico(BasicNewsRecipe):
title =u'Publico.es'
__author__ ='Gerardo Diez'
publisher =u'Mediapubli Sociedad de Publicaciones y Ediciones S.L.'
category ='news, politics, finances, world, spain, science, catalunya'
oldest_article =1
max_articles_per_feed =100
simultaneous_downloads =10
cover_url =u'http://imagenes.publico.es/css/img/logo_publico.gif'
timefmt ='[%a, %d %b, %Y]'
encoding ='utf8'
language ='es'
remove_javascript =True
no_stylesheets =True
keep_only_tags =dict(id='main')
remove_tags =[
dict(name='div', attrs={'class':['Noticias_642x50', 'contInfo ancho']}),
dict(name='ul', attrs={'class':['navComentarios', 'comentarios']}),
dict(name='div', attrs={'id':['commentsContext', 'toolbar', 'comentarios']}),
dict(name='h5', attrs={'id':'comentarios'})
]
feeds =[(u'Internacional', u'http://www.publico.es/estaticos/rss/internacional'),
(u'Espa\xf1a', u'http://www.publico.es/estaticos/rss/espana'),
(u'Dinero', u'http://www.publico.es/estaticos/rss/dinero'),
(u'Ciencias', u'http://www.publico.es/estaticos/rss/ciencias'),
(u'Culturas', u'http://www.publico.es/estaticos/rss/culturas'),
(u'Deportes', u'http://www.publico.es/estaticos/rss/deportes'),
(u'Televisi\xf3n y Gente', u'http://www.publico.es/estaticos/rss/televisionygente'),
(u'Catalu\xf1a', u'http://www.publico.es/estaticos/rss/catalunya'),
(u'Viajes', u'http://www.publico.es/estaticos/rss/viajes')]

View File

@ -1,7 +1,5 @@
-#!/usr/bin/env python
-
 __license__   = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
 eluniversal.com.mx
 '''
@ -18,75 +16,25 @@ class ElUniversal(BasicNewsRecipe):
     category             = 'news, politics, Mexico'
     no_stylesheets       = True
     use_embedded_content = False
-    encoding             = 'cp1252'
+    encoding             = 'utf8'
     remove_javascript    = True
-    language             = 'es'
+    remove_empty_feeds   = True
+    publication_type     = 'newspaper'
+    language             = 'es'
     extra_css = '''
-                  body{font-family:Arial,Helvetica,sans-serif; font-size:x-small;}
-                  .geoGris30{font-family:Georgia,"Times New Roman",Times,serif; font-size:large; color:#003366; font-weight:bold;}
-                  .arnegro16{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small;}
-                  .tbazull2{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; color:#336699; font-size:xx-small;}
-                  .tbgrisf11{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; color: #666666; font-size:xx-small;}
-                  .verrojo13{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; color: #CC0033; font-size:xx-small;}
-                  .trnegro13{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; font-size:xx-small;}
-                  .txt-fotogaleria{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; font-size:xx-small;}
+                  body{font-family:Arial,Helvetica,sans-serif}
+                  .noteTitle{font-family: Georgia,"Times New Roman",Times,serif; color: #336699; font-size: xx-large; font-weight: bold}
+                  .noteInfo{display: block; color: gray}
                 '''
-    keep_only_tags = [ dict(name='table', attrs={'width':"633"}),dict(name='table', attrs={'width':"629"}),]
+    keep_only_tags    = [ dict(name='div', attrs={'id':'noteContent'})]
+    remove_tags_after = dict(attrs={'class':'noteText'})
     remove_tags = [
-        dict(name='table', attrs={'bgcolor':"#f5f5f5"}),
-        dict(name='td', attrs={'bgcolor':"#f7f8f9"}),
-        dict(name='td', attrs={'bgcolor':"#f5f5f5"}),
-        dict(name='table', attrs={'width':"302"}),
-        dict(name='table', attrs={'width':"214"}),
-        dict(name='table', attrs={'width':"112"}),
-        dict(name='table', attrs={'width':"980"}),
-        dict(name='td', attrs={'height':"1"}),
-        dict(name='td', attrs={'height':"4"}),
-        dict(name='td', attrs={'height':"20"}),
-        dict(name='td', attrs={'height':"10"}),
-        dict(name='td', attrs={'class':["trrojo11","trbris11","trrojo12","arrojo12s","tbazul13"]}),
-        dict(name='div', attrs={'id':["mapg","ver_off_todosloscom","todosloscom"]}),
-        dict(name='span', attrs={'class':["trazul18b","trrojo11","trnaranja11","trbris11","georojo18b","geogris18"]}),
-        dict(name='span', attrs={'class':["detalles-opinion"]}),
-        dict(name='a', attrs={'class':["arnaranja12b","trbris11","arazul12rel","trrojo10"]}),
-        dict(name='img', src = "/img/icono_imprimir.gif"),
-        dict(name='img', src = "/img/icono_enviar_mail.gif"),
-        dict(name='img', src = "/img/icono_fuente_g.gif"),
-        dict(name='img', src = "/img/icono_fuente_m.gif"),
-        dict(name='img', src = "/img/icono_fuente_c.gif"),
-        dict(name='img', src = "/img/icono_compartir.gif"),
-        dict(name='img', src = "/img/icono_enviar_coment.gif"),
-        dict(name='img', src = "http://www.eluniversal.com.mx/n_img/bot-notasrel.gif"),
-        dict(name='img', src = "http://www.eluniversal.com.mx/n_img/fr.gif"),
-        dict(name='img', src = "/img/espiral2.gif"),
-        dict(name='img', src = "http://www.eluniversal.com.mx/n_img/b"),
-        dict(name='img', src = "/img/icono_enviar_coment.gifot-notasrel.gif"),
-        dict(name='img', src = "/n_img/icono_tipo3.gif"),
-        dict(name='img', src = "/n_img/icono_tipo2.gif"),
-        dict(name='img', src = "/n_img/icono_print.gif"),
-        dict(name='img', src = "/n_img/icono_mail2.gif"),
-        dict(name='img', src = "/n_img/im-comentarios-2a.gif"),
-        dict(name='img', src = "/n_img/im-comentarios-1a.gif"),
-        dict(name='img', src = "/img/icono_coment.gif"),
-        dict(name='img', src = "http://www.eluniversal.com.mx/n_img/bot-sitiosrel.gif"),
-        dict(name='img', src = "/n_img/icono_tipomenos.gif"),
-        dict(name='img', src = "/img/futbol/19.jpg"),
-        dict(name='img', alt = "Facebook"),
-        dict(name='img', alt = "Twitter"),
-        dict(name='img', alt = "Google"),
-        dict(name='img', alt = "LinkedIn"),
-        dict(name='img', alt = "Viadeo"),
-        dict(name='img', alt = "Digg"),
-        dict(name='img', alt = "Delicious"),
-        dict(name='img', alt = "Meneame"),
-        dict(name='img', alt = "Yahoo"),
-        dict(name='img', alt = "Technorati"),
-        dict(name='a',text =["Compartir","Facebook","Twitter","Google","LinkedIn","Viadeo","Digg","Delicious","Meneame","Yahoo","Technorati"]),
-        dict(name='select'),
-        dict(name='a', attrs={'class':"tbgriscompartir"}),
-    ]
+        dict(attrs={'class':'noteExtras'}),
+        dict(name=['meta','iframe','base','embed','object']),
+        dict(attrs={'id':'tm_box'})
+    ]
+    remove_attributes=['lang','onclick']

     feeds = [
        (u'Minuto por Minuto', u'http://www.eluniversal.com.mx/rss/universalmxm.xml' )
@ -101,25 +49,3 @@ class ElUniversal(BasicNewsRecipe):
       ,(u'Computacion'      , u'http://www.eluniversal.com.mx/rss/computo.xml'      )
       ,(u'Sociedad'         , u'http://www.eluniversal.com.mx/rss/sociedad.xml'     )
     ]
-
-    # def print_version(self, url):
-    #    return url.replace('/notas/','/notas/vi_')
-
-    def preprocess_html(self, soup):
-        mtag = '<meta http-equiv="Content-Language" content="es-MX"/><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
-        soup.head.insert(0,mtag)
-        for tag in soup.findAll(name='td',attrs={'class': 'arazul50'}):
-            tag.insert(0,"<h1>")
-            tag.insert(2,"</h1>")
-        return soup
-
-    def postprocess_html(self, soup,first):
-        for tag in soup.findAll(name=['table', 'span','i']):
-            tag.name = 'div'
-        for item in soup.findAll(align = "right"):
-            del item['align']
-        return soup

View File

@ -17,7 +17,7 @@ class ElPais_RSS(BasicNewsRecipe):
     no_stylesheets       = True
     encoding             = 'cp1252'
     use_embedded_content = False
-    language             = 'es_ES'
+    language             = 'es'
     remove_empty_feeds   = True
     publication_type     = 'newspaper'
     masthead_url         = 'http://www.elpais.com/im/tit_logo.gif'

View File

@ -1,7 +1,5 @@
-#!/usr/bin/env python
-
 __license__   = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 exiledonline.com
 '''
@ -20,18 +18,20 @@ class Exiled(BasicNewsRecipe):
     use_embedded_content = False
     encoding             = 'utf8'
     remove_javascript    = True
     language             = 'en'
+    publication_type     = 'newsblog'
+    masthead_url         = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
+    extra_css            = """
+                               body{font-family: Arial,Helvetica,sans-serif}
+                               #topslug{font-size: xx-large; font-weight: bold; color: red}
+                           """

-    cover_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
-
-    html2lrf_options = [
-          '--comment'        , description
-        , '--base-font-size' , '10'
-        , '--category'       , category
-        , '--publisher'      , publisher
-    ]
-
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
+    conversion_options = {
+          'comment'   : description
+        , 'tags'      : category
+        , 'publisher' : publisher
+        , 'language'  : language
+    }

     keep_only_tags = [dict(name='div', attrs={'id':'main'})]
@ -47,12 +47,13 @@ class Exiled(BasicNewsRecipe):
     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
             del item['style']
-        mtag = '\n<meta http-equiv="Content-Language" content="en"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n'
-        soup.head.insert(0,mtag)
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
         return soup

     def get_article_url(self, article):
         raw = article.get('link',  None)
         final = raw + 'all/1/'
         return final

View File

@ -1,59 +1,79 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
 __license__   = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__author__ = 'Gerardo Diez'
+__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
+description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
+__docformat__ = 'restructuredtext en'
 '''
-www.expansion.com
+expansion.es
 '''
-
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag
-
-class Expansion(BasicNewsRecipe):
-    title                 = 'Diario Expansion'
-    __author__            = 'Darko Miletic'
-    description           = 'Lider de informacion de mercados, economica y politica'
-    publisher             = 'expansion.com'
-    category              = 'news, politics, Spain'
-    oldest_article        = 2
-    max_articles_per_feed = 100
-    no_stylesheets        = True
-    use_embedded_content  = False
-    delay                 = 1
-    encoding              = 'iso-8859-15'
-    language              = 'es'
-    direction             = 'ltr'
-
-    html2lrf_options = [
-          '--comment'  , description
-        , '--category' , category
-        , '--publisher', publisher
-    ]
-
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
-
-    feeds = [
-         (u'Ultimas noticias', u'http://rss.expansion.com/rss/descarga.htm?data2=178')
-        ,(u'Temas del dia'   , u'http://rss.expansion.com/rss/descarga.htm?data2=178')
-    ]
-
-    keep_only_tags = [dict(name='div', attrs={'id':'principal'})]
-
-    remove_tags = [
-         dict(name=['object','link','script'])
-        ,dict(name='div', attrs={'class':['utilidades','tit_relacionadas']})
-    ]
-
-    remove_tags_after = [dict(name='div', attrs={'class':'tit_relacionadas'})]
-
-    def preprocess_html(self, soup):
-        soup.html['dir' ] = self.direction
-        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
-        soup.head.insert(0,mcharset)
-        for item in soup.findAll(style=True):
-            del item['style']
-        return soup
+from calibre.web.feeds.recipes import BasicNewsRecipe
+class Publico(BasicNewsRecipe):
+    title =u'Expansion.com'
+    __author__ ='Gerardo Diez'
+    publisher =u'Unidad Editorial Información Económica, S.L.'
+    category ='finances, catalunya'
+    oldest_article =1
+    max_articles_per_feed =100
+    simultaneous_downloads =10
+    cover_url =u'http://estaticos01.expansion.com/iconos/v2.x/v2.0/cabeceras/logo_expansion.png'
+    timefmt ='[%A, %d %B, %Y]'
+    encoding ='latin'
+    language ='es'
+    remove_javascript =True
+    no_stylesheets =True
+    keep_only_tags =dict(name='div', attrs={'class':['noticia primer_elemento']})
+    remove_tags =[
+        dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto']}),
+        dict(name='ul', attrs={'class':['bolos_desarrollo_noticia']}),
+        dict(name='span', attrs={'class':['comentarios']}),
+        dict(name='p', attrs={'class':['cintillo_comentarios', 'cintillo_comentarios formulario']}),
+        dict(name='div', attrs={'id':['comentarios_lectores_listado']})
+        ]
+    feeds =[
+        (u'Portada', u'http://estaticos.expansion.com/rss/portada.xml'),
+        (u'Portada: Bolsas', u'http://estaticos.expansion.com/rss/mercados.xml'),
+        (u'Divisas', u'http://estaticos.expansion.com/rss/mercadosdivisas.xml'),
+        (u'Euribor', u'http://estaticos.expansion.com/rss/mercadoseuribor.xml'),
+        (u'Materias Primas', u'http://estaticos.expansion.com/rss/mercadosmateriasprimas.xml'),
+        (u'Renta Fija', u'http://estaticos.expansion.com/rss/mercadosrentafija.xml'),
+        (u'Portada: Mi Dinero', u'http://estaticos.expansion.com/rss/midinero.xml'),
+        (u'Hipotecas', u'http://estaticos.expansion.com/rss/midinerohipotecas.xml'),
+        (u'Créditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
+        (u'Pensiones', u'http://estaticos.expansion.com/rss/midineropensiones.xml'),
+        (u'Fondos de Inversión', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
+        (u'Motor', u'http://estaticos.expansion.com/rss/midineromotor.xml'),
+        (u'Portada: Empresas', u'http://estaticos.expansion.com/rss/empresas.xml'),
+        (u'Banca', u'http://estaticos.expansion.com/rss/empresasbanca.xml'),
+        (u'TMT', u'http://estaticos.expansion.com/rss/empresastmt.xml'),
+        (u'Energía', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
+        (u'Inmobiliario y Construcción', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
+        (u'Transporte y Turismo', u'http://estaticos.expansion.com/rss/empresastransporte.xml'),
+        (u'Automoción e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
+        (u'Distribución', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
+        (u'Deporte y Negocio', u' http://estaticos.expansion.com/rss/empresasdeporte.xml'),
+        (u'Mi Negocio', u'http://estaticos.expansion.com/rss/empresasminegocio.xml'),
+        (u'Interiores', u'http://estaticos.expansion.com/rss/empresasinteriores.xml'),
+        (u'Digitech', u'http://estaticos.expansion.com/rss/empresasdigitech.xml'),
+        (u'Portada: Economía y Política', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
+        (u'Política', u'http://estaticos.expansion.com/rss/economia.xml'),
+        (u'Portada: Sociedad', u'http://estaticos.expansion.com/rss/entorno.xml'),
+        (u'Portada: Opinión', u'http://estaticos.expansion.com/rss/opinion.xml'),
+        (u'Llaves y editoriales', u'http://estaticos.expansion.com/rss/opinioneditorialyllaves.xml'),
+        (u'Tribunas', u'http://estaticos.expansion.com/rss/opiniontribunas.xml'),
+        (u'Portada: Jurídico', u'http://estaticos.expansion.com/rss/juridico.xml'),
+        (u'Entrevistas', u'http://estaticos.expansion.com/rss/juridicoentrevistas.xml'),
+        (u'Opinión', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
+        (u'Sentencias', u'http://estaticos.expansion.com/rss/juridicosentencias.xml'),
+        (u'Mujer', u'http://estaticos.expansion.com/rss/mujer-empresa.xml'),
+        (u'Catalu&ntilde;a', u'http://estaticos.expansion.com/rss/catalunya.xml'),
+        (u'Función pública', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
+        ]

View File

@ -1,5 +1,5 @@
 __license__   = 'GPL v3'
-__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 ft.com
 '''
@ -52,22 +52,38 @@ class FinancialTimes(BasicNewsRecipe):
                .copyright{font-size: x-small}
                """

-    def parse_index(self):
-        articles = []
-        soup = self.index_to_soup(self.INDEX)
-        wide = soup.find('div',attrs={'class':'wide'})
-        if wide:
-            for item in wide.findAll('a',href=True):
-                url   = self.PREFIX + item['href']
-                title = self.tag_to_string(item)
-                date  = strftime(self.timefmt)
-                articles.append({
-                      'title'      :title
-                     ,'date'       :date
-                     ,'url'        :url
-                     ,'description':''
-                    })
-        return [('FT UK edition',articles)]
+    def get_artlinks(self, elem):
+        articles = []
+        for item in elem.findAll('a',href=True):
+            url   = self.PREFIX + item['href']
+            title = self.tag_to_string(item)
+            date  = strftime(self.timefmt)
+            articles.append({
+                  'title'      :title
+                 ,'date'       :date
+                 ,'url'        :url
+                 ,'description':''
+                })
+        return articles
+
+    def parse_index(self):
+        feeds = []
+        soup = self.index_to_soup(self.INDEX)
+        wide = soup.find('div',attrs={'class':'wide'})
+        if not wide:
+            return feeds
+        strest = wide.findAll('h3', attrs={'class':'section'})
+        if not strest:
+            return feeds
+        st = wide.find('h4',attrs={'class':'section-no-arrow'})
+        if st:
+            strest.insert(0,st)
+        for item in strest:
+            ftitle = self.tag_to_string(item)
+            self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle))
+            feedarts = self.get_artlinks(item.parent.ul)
+            feeds.append((ftitle,feedarts))
+        return feeds

     def preprocess_html(self, soup):
         return self.adeify_images(soup)

View File

@ -1,4 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag
 import re

 class NatureNews(BasicNewsRecipe):
@ -10,17 +11,76 @@ class NatureNews(BasicNewsRecipe):
     max_articles_per_feed = 50
     no_stylesheets = True

-    remove_tags_before = dict(name='h1', attrs={'class':'heading entry-title'})
-    remove_tags_after  = dict(name='h2', attrs={'id':'comments'})
+    keep_only_tags = [dict(name='div', attrs={'id':'content'})]
+    # remove_tags_before = dict(name='h1', attrs={'class':'heading entry-title'})
+    # remove_tags_after = dict(name='h2', attrs={'id':'comments'})
     remove_tags = [
         dict(name='h2', attrs={'id':'comments'}),
         dict(attrs={'alt':'Advertisement'}),
         dict(name='div', attrs={'class':'ad'}),
+        dict(attrs={'class':'Z3988'}),
+        dict(attrs={'class':['formatpublished','type-of-article','cleardiv','disclaimer','buttons','comments xoxo']}),
+        dict(name='a', attrs={'href':'#comments'}),
+        dict(name='h2',attrs={'class':'subheading plusicon icon-add-comment'})
         ]

     preprocess_regexps = [
         (re.compile(r'<p>ADVERTISEMENT</p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
         ]

+    extra_css = '''
+        .author { text-align: right; font-size: small; line-height:1em; margin-top:0px; margin-left:0; margin-right:0; margin-bottom: 0; }
+        .imagedescription { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+        .imagecredit { font-size: x-small; font-style: normal; font-weight: bold}
+        '''
+
     feeds = [('Nature News', 'http://feeds.nature.com/news/rss/most_recent')]
+
+    def preprocess_html(self,soup):
+        # The author name is slightly buried - dig it up
+        author = soup.find('p', {'class':'byline'})
+        if author:
+            # Find out the author's name
+            authornamediv = author.find('span',{'class':'author fn'})
+            authornamelink = authornamediv.find('a')
+            if authornamelink:
+                authorname = authornamelink.contents[0]
+            else:
+                authorname = authornamediv.contents[0]
+            # Stick the author's name in the byline tag
+            tag = Tag(soup,'div')
+            tag['class'] = 'author'
+            tag.insert(0,authorname.strip())
+            author.replaceWith(tag)
+
+        # Change the intro from a p to a div
+        intro = soup.find('p',{'class':'intro'})
+        if intro:
+            tag = Tag(soup,'div')
+            tag['class'] = 'intro'
+            tag.insert(0,intro.contents[0])
+            intro.replaceWith(tag)
+
+        # Change span class=imagedescription to div
+        descr = soup.find('span',{'class':'imagedescription'})
+        if descr:
+            tag = Tag(soup,'div')
+            tag['class'] = 'imagedescription'
+            tag.insert(0,descr.renderContents())
+            descr.replaceWith(tag)
+
+        # The references are in a list, let's make them simpler
+        reflistcont = soup.find('ul',{'id':'article-refrences'})
+        if reflistcont:
+            reflist = reflistcont.li.renderContents()
+            tag = Tag(soup,'div')
+            tag['class'] = 'article-references'
+            tag.insert(0,reflist)
+            reflistcont.replaceWith(tag)
+
+        # Within the id=content div, we need to remove all the stuff after the end of the class=entry-content
+        entrycontent = soup.find('div',{'class':'entry-content'})
+        for nextSibling in entrycontent.findNextSiblings():
+            nextSibling.extract()
+
+        return soup

View File

@ -8,12 +8,13 @@ __docformat__ = 'restructuredtext en'
 globeandmail.com
 '''

+import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class AdvancedUserRecipe1287083651(BasicNewsRecipe):
     title          = u'Globe & Mail'
-    __license__    = 'GPL v3'
-    __author__     = 'Szing'
+    __author__     = 'Kovid Goyal'
     oldest_article = 2
     no_stylesheets = True
     max_articles_per_feed = 100
@ -38,24 +39,19 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
       (u'Sports', u'http://www.theglobeandmail.com/auto/?service=rss')
    ]

-    keep_only_tags = [
-        dict(name='h1'),
-        dict(name='h2', attrs={'id':'articletitle'}),
-        dict(name='p', attrs={'class':['leadText', 'meta', 'leadImage', 'redtext byline', 'bodyText']}),
-        dict(name='div', attrs={'class':['news','articlemeta','articlecopy']}),
-        dict(name='id', attrs={'class':'article'}),
-        dict(name='table', attrs={'class':'todays-market'}),
-        dict(name='header', attrs={'id':'leadheader'})
-    ]
-
-    remove_tags = [
-        dict(name='div', attrs={'id':['tabInside', 'ShareArticles', 'topStories']})
-    ]
-
-    #this has to be here or the text in the article appears twice.
-    remove_tags_after = [dict(id='article')]
+    preprocess_regexps = [
+        (re.compile(r'<head.*?</head>', re.DOTALL), lambda m: '<head></head>'),
+        (re.compile(r'<script.*?</script>', re.DOTALL), lambda m: ''),
+    ]
+
+    remove_tags_before = dict(name='h1')
+    remove_tags = [
+        dict(name='div', attrs={'id':['ShareArticles', 'topStories']}),
+        dict(href=lambda x: x and 'tracking=' in x),
+        {'class':['articleTools', 'pagination', 'Ads', 'topad',
+            'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]

     #Use the mobile version rather than the web version
     def print_version(self, url):
-        return url + '&service=mobile'
+        return url.rpartition('?')[0] + '?service=mobile'

View File

@ -3,29 +3,31 @@ __license__ = 'GPL v3'
 __copyright__ = '04 December 2010, desUBIKado'
 __author__ = 'desUBIKado'
 __description__ = 'Daily newspaper from Aragon'
-__version__ = 'v0.03'
-__date__ = '11, December 2010'
+__version__ = 'v0.04'
+__date__ = '6, January 2011'
 '''
 http://www.heraldo.es/
 '''

 import time
+import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class heraldo(BasicNewsRecipe):
     __author__  = 'desUBIKado'
     description = 'Daily newspaper from Aragon'
     title       = u'Heraldo de Aragon'
     publisher   = 'OJD Nielsen'
     category    = 'News, politics, culture, economy, general interest'
     language    = 'es'
     timefmt     = '[%a, %d %b, %Y]'
-    oldest_article = 1
+    oldest_article = 2
+    delay          = 1
     max_articles_per_feed = 100
     use_embedded_content  = False
     remove_javascript = True
     no_stylesheets = True
+    recursion = 10

     feeds = [
         (u'Portadas', u'http://www.heraldo.es/index.php/mod.portadas/mem.rss')
@ -37,29 +39,39 @@ class heraldo(BasicNewsRecipe):
     remove_tags = [dict(name='a', attrs={'class':['com flo-r','enl-if','enl-df']}),
                    dict(name='div', attrs={'class':['brb-b-s con marg-btt','cnt-rel con']}),
-                   dict(name='form', attrs={'class':'form'})]
+                   dict(name='form', attrs={'class':'form'}),
+                   dict(name='ul', attrs={'id':['cont-tags','pag-1']})]

     remove_tags_before = dict(name='div' , attrs={'id':'dts'})
     remove_tags_after  = dict(name='div' , attrs={'id':'com'})

     def get_cover_url(self):
         cover = None
         st = time.localtime()
         year = str(st.tm_year)
         month = "%.2d" % st.tm_mon
         day = "%.2d" % st.tm_mday
         #http://oldorigin-www.heraldo.es/20101211/primeras/portada_aragon.pdf
         cover='http://oldorigin-www.heraldo.es/'+ year + month + day +'/primeras/portada_aragon.pdf'
         br = BasicNewsRecipe.get_browser()
         try:
             br.open(cover)
         except:
             self.log("\nPortada no disponible")
             cover ='http://www.heraldo.es/MODULOS/global/publico/interfaces/img/logo-Heraldo.png'
         return cover

     extra_css = '''
-        h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:xx-large;}
+        .con strong{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
+        .con h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
+        .con span{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:12px;}
+        .ent {font-family:Arial,Helvetica,sans-serif; font-weight:normal; font-style:italic; font-size:18px;}
+        img{margin-bottom: 0.4em}
         '''
+
+    preprocess_regexps = [
+        # To separate the comments with a blank line
+        (re.compile(r'<div id="com"', re.DOTALL|re.IGNORECASE), lambda match: '<br><div id="com"')
+    ]

View File

@ -0,0 +1,23 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1293122276(BasicNewsRecipe):
title = u'Smarter Planet | Tumblr for eReaders'
__author__ = 'Jack Mason'
author = 'IBM Global Business Services'
publisher = 'IBM'
category = 'news, technology, IT, internet of things, analytics'
oldest_article = 7
max_articles_per_feed = 30
no_stylesheets = True
use_embedded_content = False
masthead_url = 'http://30.media.tumblr.com/tumblr_l70dow9UmU1qzs4rbo1_r3_250.jpg'
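# keep only the Tumblr post container; everything outside id='item' is page chrome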
remove_tags_before = dict(id='item')
remove_tags_after = dict(id='item')
remove_tags = [dict(attrs={'class':['sidebar', 'about', 'footer', 'description', 'disqus', 'nav', 'notes', 'disqus_thread']}),
dict(id=['sidebar', 'footer', 'disqus', 'nav', 'notes', 'likes_container', 'description', 'disqus_thread', 'about']),
dict(name=['script', 'noscript', 'style'])]
feeds = [(u'Smarter Planet Tumblr', u'http://smarterplanet.tumblr.com/mobile/rss')]

View File

@ -0,0 +1,182 @@
import re, time
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
class IHNed(BasicNewsRecipe):
stahnout_vsechny = False
#True = download everything linked from the homepage
#False = download only today's articles (from the day the script is run)
title = 'iHNed'
__author__ = 'Karel Bílek'
language = 'cs'
description = 'Zprávy z iHNed.cz'
timefmt = ' [%a, %d %b, %Y]'
needs_subscription = False
remove_tags = [dict(attrs={'class':['borderbottom', 'web', 'foot', 'reklama', 'd-elm d-rellinks', 'd-elm']}),
dict(style=['text-align: center;']),
dict(id=['r-bfull']),
dict(name=['script', 'noscript', 'style'])]
encoding = 'windows-1250'
no_stylesheets = True
remove_tags_before = dict(attrs={'class':'d-nadtit'})
remove_tags_after = dict(attrs={'class':'like'})
conversion_options = {
'linearize_tables' : True,
}
def preprocess_html(self, soup):
def makeurl(wat):
return "http://ihned.cz"+wat;
for h1 in soup.findAll('h1'):
a = h1.find('a')
if a:
string = a.string
if string:
soup.a.replaceWith(string)
for a in soup.findAll('a', href=True) :
cil = str(a['href'])
if cil.startswith("/") or cil.startswith("index"):
a['href'] = makeurl(cil)
return soup
def parse_index(self):
def makeurl(wat):
if wat.startswith("/") or wat.startswith("index"):
return "http://ihned.cz"+wat;
else:
return wat
articles = {} #the result, roughly: section name -> list of articles
key = None #current section
ans = [] #all sections
articles["Hlavní"] = []
ans.append("Hlavní")
was = {}
def parse_subpage(url, name):
articles[name] = []
ans.append(name)
soup = self.index_to_soup(url)
otvirak = soup.find(True, attrs={'class':['otv']})
if otvirak:
#the same lead-story handling as on the homepage below, simply copy-pasted here
a = otvirak.find('a', href=True)
title = self.tag_to_string(a, use_alt=True).strip()
txt = otvirak.find(True, attrs={'class':['txt']})
description = ''
if txt:
match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
if match:
description = match.group(1)
pubdate = strftime('%d. %m.')
if not title in was:
articles[name].append(
dict(title=title, url=makeurl(a['href']), date=pubdate,
description=description,
content=''))
otv234 = soup.find(True, attrs={'class':['otv234', 'col2a']})
if otv234:
for ow in otv234.findAll(True, attrs={'class':['ow']}):
a = ow.find('a', href=True)
title = self.tag_to_string(a, use_alt=True).strip()
description=''
prx = ow.find(True, attrs={'class':['prx']})
if prx:
description = str(prx.string)
nfo = ow.find(True, attrs={'class':['nfo']})
pubdate = ''
if nfo:
dtime = time.localtime()
day = dtime[2]
month = dtime[1]
pubdate = strftime('%d. %m.')
match = re.search(r'([0-9]*)\.([0-9]*)\.', str(nfo))
if self.stahnout_vsechny or (int(day) == int(match.group(1)) and int(month) == int(match.group(2))):
if not title in was:
articles[name].append(
dict(title=title, url=makeurl(a['href']), date=pubdate,
description=description,
content=''))
soup = self.index_to_soup('http://ihned.cz/')
otvirak = soup.find(True, attrs={'class':['otv']})
if otvirak:
a = otvirak.find('a', href=True)
title = self.tag_to_string(a, use_alt=True).strip()
txt = otvirak.find(True, attrs={'class':['txt']})
description = ''
if txt:
match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
if match:
description = match.group(1)
pubdate = strftime('%d. %m.')
feed = "Hlavní"
articles[feed].append(
dict(title=title, url=(a['href']), date=pubdate,
description=description,
content=''))
was[title]=1
otvirak2345 = soup.find(True, attrs={'class':['otv2345']})
if otvirak2345:
for otv2 in otvirak2345.findAll(True, attrs={'class':['otv2-5']}):
a = otv2.find('a', attrs={'class':['tit2']}, href=True)
title = self.tag_to_string(a, use_alt=True).strip()
description=''
span = otv2.find('span')
if span:
match = re.match(r'<span>\s*([^<]*)\s*<a', str(span), re.L)
if match:
description = match.group(1)
feed = "Hlavní"
pubdate = strftime('%d. %m.')
articles[feed].append(
dict(title=title, url=(a['href']), date=pubdate,
description=description,
content=''))
was[title]=1
parse_subpage("http://komentare.ihned.cz/", "Komentáře")
parse_subpage("http://domaci.ihned.cz", "Domácí")
parse_subpage("http://ekonomika.ihned.cz", "Ekonomika")
parse_subpage("http://zahranicni.ihned.cz/", "Zahraničí");
parse_subpage("http://finweb.ihned.cz/", "Finance");
parse_subpage("http://digiweb.ihned.cz/", "DigiWeb");
parse_subpage("http://kultura.ihned.cz/", "Kultura")
parse_subpage("http://sport.ihned.cz/", "Sport");
#sort the sections
ans = self.sort_index_by(ans, {'Hlavní':1, 'Domácí':2, 'Ekonomika':5, 'Zahraničí':3, 'Finance':6, 'DigiWeb':7, 'Kultura':8, 'Sport':9, 'Komentáře':4})
#return them, but only the sections that actually have articles
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@ -0,0 +1,52 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class KANewsRecipe(BasicNewsRecipe):
title = u'KA-News.de'
description = u'Nachrichten aus Karlsruhe, Deutschland und der Welt.'
__author__ = 'tfeld'
language = 'de'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
feeds = [
(u'News aus Karlsruhe', 'http://www.ka-news.de/storage/rss/rss/karlsruhe.xml'),
(u'Kulturnachrichten aus Karlsruhe', 'http://www.ka-news.de/storage/rss/rss/kultur.xml'),
(u'Durlach: News aus Durlach', 'http://www.ka-news.de/storage/rss/rss/durlach.xml'),
(u'Stutensee: News aus Stutensee Blankenloch, Büchig, Friedrichstal, Staffort, Spöck', 'http://www.ka-news.de/storage/rss/rss/stutensee.xml'),
(u'Bruchsal: News aus Bruchsal', 'http://www.ka-news.de/storage/rss/rss/bruchsal.xml'),
(u'Wirtschaftsnews aus Karlsruhe', 'http://www.ka-news.de/storage/rss/rss/wirtschaft.xml'),
(u'ka-news.de - Sport', 'http://www.ka-news.de/storage/rss/rss/sport.xml'),
(u'KSC-News - News rund um den KSC', 'http://www.ka-news.de/storage/rss/rss/ksc.xml'),
(u'ka-news.de - BG Karlsruhe', 'http://www.ka-news.de/storage/rss/rss/basketball.xml')
]
preprocess_regexps = [
(re.compile(r'width:[0-9]*?px', re.DOTALL|re.IGNORECASE), lambda match: ''),
]
remove_tags_before = dict(id='artdetail_ueberschrift')
remove_tags_after = dict(id='artdetail_unterzeile')
remove_tags = [dict(name=['div'], attrs={'class': 'lbx_table'}),
dict(name=['div'], attrs={'class': 'lk_zumthema'}),
dict(name=['div'], attrs={'class': 'lk_thumb'}),
dict(name=['div'], attrs={'class': 'lk_trenner'}),
dict(name=['div'], attrs={'class': 'lupen_container'}),
dict(name=['script']),
dict(name=['span'], attrs={'style': 'display:none;'}),
dict(name=['span'], attrs={'class': 'comm_info'}),
dict(name=['h3'], attrs={'id': 'artdetail_unterzeile'})]
# removing the style attribute _after_ removing the specific tags above
remove_attributes = ['width','height','style']
extra_css = '''
h1{ font-size:large; font-weight:bold; }
h2{ font-size:medium; font-weight:bold; }
'''
def get_cover_url(self):
return 'http://www.ka-news.de/storage/scl/techkanews/logos/434447_m1t1w250q75s1v29681_ka-news-Logo_mit_Schatten_transparent.png'

View File

@ -0,0 +1,16 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1295262156(BasicNewsRecipe):
title = u'kath.net'
__author__ = 'Bobus'
oldest_article = 7
max_articles_per_feed = 100
feeds = [(u'kath.net', u'http://www.kath.net/2005/xml/index.xml')]
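# kath.net serves a printer-friendly page when print=yes is appended to the article URL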
def print_version(self, url):
return url+"&print=yes"
extra_css = 'td.textb {font-size: medium;}'

View File

@ -28,6 +28,8 @@ class LaRepubblica(BasicNewsRecipe):
     recursion         = 10
     remove_javascript = True
+    no_stylesheets    = True
+
     def get_article_url(self, article):
         link = article.get('id', article.get('guid', None))
         if link is None:

View File

@ -1,4 +1,3 @@
-#!/usr/bin/env python
 __license__   = 'GPL v3'
 __author__    = 'Lorenzo Vigentini'
 __copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
@ -14,7 +13,7 @@ from calibre.web.feeds.news import BasicNewsRecipe

 class ledevoir(BasicNewsRecipe):
     author      = 'Lorenzo Vigentini'
-    description = 'Canadian Paper'
+    description = 'Canadian Paper. A subscription is optional, with it you get more content'

     cover_url = 'http://www.ledevoir.com/images/ul/graphiques/logo_devoir.gif'

     title = u'Le Devoir'
@ -28,6 +27,7 @@ class ledevoir(BasicNewsRecipe):
     max_articles_per_feed = 50
     use_embedded_content  = False
     recursion             = 10
+    needs_subscription    = 'optional'

     remove_javascript = True
     no_stylesheets    = True
@ -77,3 +77,12 @@ class ledevoir(BasicNewsRecipe):
     .credit {color:#787878;font-size:0.71em;line-height:1.1em;font-weight:bold;}
     .texte {font-size:1.15em;line-height:1.4em;margin-bottom:17px;}
     '''
+
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+        if self.username is not None and self.password is not None:
+            br.open('http://www.ledevoir.com')
+            br.select_form(nr=1)
+            br['login[courriel]'] = self.username
+            br['login[password]'] = self.password
+            br.submit()
+        return br

View File

@ -0,0 +1,32 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1295081935(BasicNewsRecipe):
title = u'Mail & Guardian ZA News'
__author__ = '77ja65'
language = 'en'
oldest_article = 7
max_articles_per_feed = 30
no_stylesheets = True
masthead_url = 'http://c1608832.cdn.cloudfiles.rackspacecloud.com/mg_logo.gif'
remove_tags_after = [dict(id='content')]
feeds = [
(u'National News', u'http://www.mg.co.za/rss/national'),
(u'Top Stories', u'http://www.mg.co.za/rss'),
(u'Africa News', u'http://www.mg.co.za/rss/africa'),
(u'Sport', u'http://www.mg.co.za/rss/sport'),
(u'Business', u'http://www.mg.co.za/rss/business'),
(u'And In Other News', u'http://www.mg.co.za/rss/and-in-other-news'),
(u'World News', u'http://www.mg.co.za/rss/world')
]
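# mg.co.za serves a print-friendly page when /article/ is swapped for /printformat/single/ in the URL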
def print_version(self, url):
return url.replace('http://www.mg.co.za/article/',
'http://www.mg.co.za/printformat/single/')
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
'''

View File

@ -1,10 +1,9 @@
 __license__   = 'GPL v3'
-__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 msnbc.msn.com
 '''

-import re
 from calibre.web.feeds.recipes import BasicNewsRecipe

 class MsNBC(BasicNewsRecipe):
@ -19,7 +18,16 @@ class MsNBC(BasicNewsRecipe):
     publisher = 'msnbc.com'
     category  = 'news, USA, world'
     language  = 'en'
-    extra_css = ' body{ font-family: sans-serif } .head{font-family: serif; font-size: xx-large; font-weight: bold; color: #CC0000} .abstract{font-weight: bold} .source{font-size: small} .updateTime{font-size: small} '
+    extra_css = """
+                body{ font-family: Georgia,Times,serif }
+                .hide{display: none}
+                .caption{font-family: Arial,sans-serif; font-size: x-small}
+                .entry-summary{font-family: Arial,sans-serif}
+                .copyright{font-size: 0.95em; font-style: italic}
+                .source-org{font-size: small; font-family: Arial,sans-serif}
+                img{display: block; margin-bottom: 0.5em}
+                span.byline{display: none}
+                """

     conversion_options = {
                           'comments' : description
@ -28,14 +36,20 @@ class MsNBC(BasicNewsRecipe):
                          ,'publisher': publisher
                          }

-    preprocess_regexps = [
-        (re.compile(r'</style></head>', re.DOTALL|re.IGNORECASE),lambda match: '</style>')
-        ,(re.compile(r'<div class="head">', re.DOTALL|re.IGNORECASE),lambda match: '</head><body><div class="head">'),
-    ]
+    remove_tags_before = dict(name='h1', attrs={'id':'headline'})
+    remove_tags_after  = dict(name='span', attrs={'class':['copyright','Linear copyright']})
+    keep_only_tags=[
+        dict(attrs={'id':['headline','deck','byline','source','intelliTXT']})
+        ,dict(attrs={'class':['gl_headline','articleText','drawer-content Linear','v-center3','byline','textBodyBlack']})
+    ]
+    remove_attributes=['property','lang','rel','xmlns:fb','xmlns:v','xmlns:dc','xmlns:dcmitype','xmlns:og','xmlns:media','xmlns:vcard','typeof','itemscope','itemtype','itemprop','about','type','size','width','height','onreadystatechange','data','border','hspace','vspace']
+    remove_tags = [
+        dict(name=['iframe','object','link','embed','meta','table'])
+        ,dict(name='span', attrs={'class':['copyright','Linear copyright']})
+        ,dict(name='div', attrs={'class':'social'})
+    ]

-    remove_tags_before = dict(name='div', attrs={'class':'head'})
-    remove_tags_after  = dict(name='div', attrs={'class':'copyright'})
-    remove_tags        = [dict(name=['iframe','object','link','script','form'])]

     feeds = [
        (u'US News'       , u'http://rss.msnbc.msn.com/id/3032524/device/rss/rss.xml' )
@ -48,11 +62,26 @@ class MsNBC(BasicNewsRecipe):
       ,(u'Tech & Science', u'http://rss.msnbc.msn.com/id/3032117/device/rss/rss.xml' )
     ]

+    def print_version(self, url):
+        return url + 'print/1/displaymode/1098/'
+
     def preprocess_html(self, soup):
-        for item in soup.head.findAll('div'):
-            item.extract()
+        for item in soup.body.findAll('html'):
+            item.name='div'
+        for item in soup.body.findAll('div'):
+            if item.has_key('id') and item['id'].startswith('vine-'):
+                item.extract()
+            if item.has_key('class') and ( item['class'].startswith('ad') or item['class'].startswith('vine')):
+                item.extract()
+        for item in soup.body.findAll('img'):
+            if not item.has_key('alt'):
+                item['alt'] = 'image'
+        for item in soup.body.findAll('ol'):
+            if item.has_key('class') and item['class'].startswith('grid'):
+                item.extract()
+        for item in soup.body.findAll('span'):
+            if ( item.has_key('id') and item['id'].startswith('byLine') and item.string is None) or ( item.has_key('class') and item['class'].startswith('inline') ):
+                item.extract()
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
         return soup

View File

@ -0,0 +1,74 @@
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1294342201(BasicNewsRecipe):
title = u'New London Day'
__author__ = 'Being'
description = 'State, local and business news from New London, CT'
language = 'en'
oldest_article = 1
max_articles_per_feed = 200
use_embedded_content = False
no_stylesheets = True
remove_javascript = True
remove_tags_before = dict(id='article')
remove_tags_after = [ {'class':['photo_article',]} ]
remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer"]},
{'class':["clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent"]},
dict(name='font',attrs={'id':["cr-other-headlines"]})]
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
.byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
.date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
.copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center}
.story{font-family:Arial,Helvetica,sans-serif;font-size:small;}
.entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;}
.pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;}
.maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;}
.story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''
feeds = [
(u'All News', u'http://www.theday.com/section/rss'),
(u'Breaking News', u'http://www.theday.com/section/rss01'),
(u'Police and Courts', u'http://www.theday.com/section/rss02'),
(u'State News', u'http://www.theday.com/section/rss03'),
(u'Local Business', u'http://www.theday.com/section/rss04'),
(u'Entertainment', u'http://www.theday.com/section/rss05'),
(u'Opinion', u'http://www.theday.com/section/rss06'),
(u'Casinos', u'http://www.theday.com/section/rss12'),
(u'Defense and Military', u'http://www.theday.com/section/rss14'),
(u'Ann Baldelli Ruminations', u'http://www.theday.com/section/rss20'),
(u'Paul Choiniere Ruminations', u'http://www.theday.com/section/rss21'),
(u'Michael Costanza Omnivore', u'http://www.theday.com/section/rss23'),
(u'Rebecca Dangelo Reel Life', u'http://www.theday.com/section/rss25'),]
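# theday.com serves a print-friendly copy of each story at /print.html instead of /index.html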
def print_version(self, url):
return url.replace('/index.html', '/print.html')
def get_article_url(self, article):
return article.get('feedburner_origlink', article.get('guid', article.get('link')))
def postprocess_html(self, soup, first_fetch):
for t in soup.findAll(['table', 'tr', 'td']):
t.name = 'div'
for tag in soup.findAll('form', dict(attrs={'name':["comments_form"]})):
tag.extract()
for tag in soup.findAll('font', dict(attrs={'id':["cr-other-headlines"]})):
tag.extract()
return soup

View File

@ -27,6 +27,9 @@ class NikkeiNet_sub_economy(BasicNewsRecipe):
         {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
         {'class':"cmn-article_keyword cmn-clearfix"},
         {'class':"cmn-print_headline cmn-clearfix"},
+        {'class':"cmn-article_list"},
+        dict(id="ABOUT-NIKKEI"),
+        {'class':"cmn-sub_market"},
     ]
     remove_tags_after = {'class':"cmn-pr_list"}

View File

@ -0,0 +1,32 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = u'Chema Cort\xe9s - 2011-01-05'
__version__ = 'v0.01'
__date__ = '2011-01-05'
'''
njp.org
'''
from calibre.web.feeds.news import BasicNewsRecipe
class NewJournalOfPhysics(BasicNewsRecipe):
title = u'New Journal of Physics'
__author__ = u'Chema Cort\xe9s'
description = u'The open-access journal for physics'
publisher = u'IOP (Institute of Physics)'
category = 'physics, journal, science'
language = 'en'
oldest_article = 30
max_articles_per_feed = 100
keep_only_tags = [dict(id=['fulltextContainer'])]
no_stylesheets=True
use_embedded_content=False
feeds = [(u'Latest Papers', u'http://iopscience.iop.org/1367-2630/?rss=1')]
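# iopscience.iop.org serves the full article body at <article URL>/fulltext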
def print_version(self, url):
return url+"/fulltext"

View File

@ -685,3 +685,28 @@ class NYTimes(BasicNewsRecipe):
divTag.replaceWith(tag)
return soup
def populate_article_metadata(self, article, soup, first):
shortparagraph = ""
try:
if len(article.text_summary.strip()) == 0:
articlebodies = soup.findAll('div',attrs={'class':'articleBody'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
paras = articlebody.findAll('p')
for p in paras:
refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
#account for blank paragraphs and short paragraphs by appending them to longer ones
if len(refparagraph) > 0:
if len(refparagraph) > 70: #approximately one line of text
article.summary = article.text_summary = shortparagraph + refparagraph
return
else:
shortparagraph = refparagraph + " "
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
shortparagraph = shortparagraph + "- "
except:
self.log("Error creating article descriptions")
return

View File

@ -685,4 +685,27 @@ class NYTimes(BasicNewsRecipe):
divTag.replaceWith(tag)
return soup
def populate_article_metadata(self, article, soup, first):
shortparagraph = ""
try:
if len(article.text_summary.strip()) == 0:
articlebodies = soup.findAll('div',attrs={'class':'articleBody'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
paras = articlebody.findAll('p')
for p in paras:
refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
#account for blank paragraphs and short paragraphs by appending them to longer ones
if len(refparagraph) > 0:
if len(refparagraph) > 70: #approximately one line of text
article.summary = article.text_summary = shortparagraph + refparagraph
return
else:
shortparagraph = refparagraph + " "
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
shortparagraph = shortparagraph + "- "
except:
self.log("Error creating article descriptions")
return

View File

@ -0,0 +1,61 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
pressthink.org
'''

from calibre.web.feeds.recipes import BasicNewsRecipe

class PressThink(BasicNewsRecipe):
    title = 'PressThink'
    __author__ = 'Darko Miletic'
    description = 'Ghost of democracy in the media machine'
    oldest_article = 60
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf8'
    publisher = 'Arthur L. Carter Journalism Institute'
    category = 'news, USA, world, economy, politics, media'
    language = 'en'
    publication_type = 'blog'
    extra_css = """
        body{ font-family: Helvetica,Arial,sans-serif }
        img{display: block; margin-bottom: 0.5em}
        h6{font-size: 1.1em; font-weight: bold}
        .post-author{font-family: Georgia,serif}
        .post-title{color: #AB0000}
        .says{color: gray}
        .comment {
            border-bottom: 1px dotted #555555;
            border-top: 1px dotted #DDDDDD;
            margin-left: 10px;
            min-height: 100px;
            padding: 15px 0 20px;
        }
    """

    conversion_options = {
        'comments'  : description
        ,'tags'     : category
        ,'language' : language
        ,'publisher': publisher
    }

    remove_tags = [dict(name=['form','iframe','embed','object','link','base','table','meta'])]
    keep_only_tags = [dict(attrs={'class':['post-title','post-author','entry','postmetadata alt','commentlist']})]
    feeds = [(u'Articles', u'http://pressthink.org/feed/')]

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll('img', alt=False):
            item['alt'] = 'image'
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        return soup

View File

@ -17,8 +17,8 @@ class SmithsonianMagazine(BasicNewsRecipe):
     remove_tags = [
         dict(name='iframe'),
         dict(name='div', attrs={'class':'article_sidebar_border'}),
-        dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large']}),
+        dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large', 'most-popular-body_large']}),
-        #dict(name='ul', attrs={'class':'article-tools'}),
+        ##dict(name='ul', attrs={'class':'article-tools'}),
         dict(name='ul', attrs={'class':'cat-breadcrumb col three last'}),
     ]
@ -37,16 +37,16 @@ class SmithsonianMagazine(BasicNewsRecipe):
     ]

     def preprocess_html(self, soup):
-        story = soup.find(name='div', attrs={'id':'article-left'})
+        story = soup.find(name='div', attrs={'id':'article-body'})
-        #td = heading.findParent(name='td')
-        #td.extract()
+        ##td = heading.findParent(name='td')
+        ##td.extract()
         soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
         body = soup.find(name='body')
         body.insert(0, story)
         return soup

-    def postprocess_html(self, soup, first):
-        for p in soup.findAll(id='articlePaginationWrapper'): p.extract()
-        if not first:
-            for div in soup.findAll(id='article-head'): div.extract()
-        return soup
+    #def postprocess_html(self, soup, first):
+        #for p in soup.findAll(id='articlePaginationWrapper'): p.extract()
+        #if not first:
+            #for div in soup.findAll(id='article-head'): div.extract()
+        #return soup

View File

@ -1,5 +1,5 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+#from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from urllib import quote

 class SportsIllustratedRecipe(BasicNewsRecipe) :
@ -91,7 +91,7 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
         # expire : no idea what value to use
         # All this comes from the Javascript function that redirects to the print version. It's called PT() and is defined in the file 48.js

-    def preprocess_html(self, soup):
+    '''def preprocess_html(self, soup):
         header = soup.find('div', attrs = {'class' : 'siv_artheader'})
         homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
         body = homeMadeSoup.body
@ -115,4 +115,5 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
             body.append(para)

         return homeMadeSoup
+    '''

View File

@ -0,0 +1,115 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.thesundaytimes.co.uk
'''

import urllib
from calibre.web.feeds.news import BasicNewsRecipe

class TimesOnline(BasicNewsRecipe):
    title = 'The Sunday Times UK'
    __author__ = 'Darko Miletic'
    description = 'news from United Kingdom and World'
    language = 'en_GB'
    publisher = 'Times Newspapers Ltd'
    category = 'news, politics, UK'
    oldest_article = 3
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    delay = 1
    needs_subscription = True
    publication_type = 'newspaper'
    masthead_url = 'http://www.thesundaytimes.co.uk/sto/public/images/logos/logo-home.gif'
    INDEX = 'http://www.thesundaytimes.co.uk'
    PREFIX = u'http://www.thesundaytimes.co.uk/sto/'
    extra_css = """
        .author-name,.authorName{font-style: italic}
        .published-date,.multi-position-photo-text{font-family: Arial,Helvetica,sans-serif;
            font-size: small; color: gray;
            display:block; margin-bottom: 0.5em}
        body{font-family: Georgia,"Times New Roman",Times,serif}
    """

    conversion_options = {
        'comment'    : description
        , 'tags'     : category
        , 'publisher': publisher
        , 'language' : language
    }

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        br.open('http://www.timesplus.co.uk/tto/news/?login=false&url=http://www.thesundaytimes.co.uk/sto/')
        if self.username is not None and self.password is not None:
            data = urllib.urlencode({ 'userName':self.username
                                     ,'password':self.password
                                     ,'keepMeLoggedIn':'false'
                                    })
            br.open('https://www.timesplus.co.uk/iam/app/authenticate', data)
        return br

    remove_tags = [
        dict(name=['object','link','iframe','base','meta'])
        ,dict(attrs={'class':'tools comments-parent'})
    ]
    remove_attributes = ['lang']
    keep_only_tags = [
        dict(attrs={'class':'standard-content'})
        ,dict(attrs={'class':'f-author'})
        ,dict(attrs={'id':'bodycopy'})
    ]
    remove_tags_after = dict(attrs={'class':'tools_border'})
    feeds = [
        (u'UK News'     , PREFIX + u'news/uk_news/'        )
        ,(u'World'      , PREFIX + u'news/world_news/'     )
        ,(u'Politics'   , PREFIX + u'news/Politics/'       )
        ,(u'Focus'      , PREFIX + u'news/focus/'          )
        ,(u'Insight'    , PREFIX + u'news/insight/'        )
        ,(u'Ireland'    , PREFIX + u'news/ireland/'        )
        ,(u'Columns'    , PREFIX + u'comment/columns/'     )
        ,(u'Arts'       , PREFIX + u'culture/arts/'        )
        ,(u'Books'      , PREFIX + u'culture/books/'       )
        ,(u'Film and TV', PREFIX + u'culture/film_and_tv/' )
        ,(u'Sport'      , PREFIX + u'sport/'               )
        ,(u'Business'   , PREFIX + u'business'             )
        ,(u'Money'      , PREFIX + u'business/money/'      )
        ,(u'Style'      , PREFIX + u'style/'               )
        ,(u'Travel'     , PREFIX + u'travel/'              )
        ,(u'Clarkson'   , PREFIX + u'ingear/clarkson/'     )
        ,(u'Cars'       , PREFIX + u'ingear/cars/'         )
        ,(u'Bikes'      , PREFIX + u'ingear/2_Wheels/'     )
        ,(u'Tech'       , PREFIX + u'ingear/Tech___Games/' )
        ,(u'Magazine'   , PREFIX + u'Magazine/'            )
    ]

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return self.adeify_images(soup)

    def parse_index(self):
        totalfeeds = []
        lfeeds = self.get_feeds()
        for feedobj in lfeeds:
            feedtitle, feedurl = feedobj
            self.report_progress(0, _('Fetching feed') + ' %s...' % (feedtitle if feedtitle else feedurl))
            articles = []
            soup = self.index_to_soup(feedurl)
            for atag in soup.findAll('a', href=True):
                parentName = atag.parent.name
                title = self.tag_to_string(atag).strip()
                if (parentName == 'h2' or parentName == 'h3') and title is not None and title != '':
                    url = self.INDEX + atag['href']
                    articles.append({
                        'title'       : title
                        ,'date'       : ''
                        ,'url'        : url
                        ,'description': ''
                    })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds

View File

@ -35,7 +35,6 @@ class TechnologyReview(BasicNewsRecipe):
     def get_article_url(self, article):
         return article.get('guid', article.get('id', None))

     def print_version(self, url):
         baseurl='http://www.technologyreview.com/printer_friendly_article.aspx?id='
         split1 = string.split(url,"/")
@ -43,3 +42,25 @@ class TechnologyReview(BasicNewsRecipe):
         split2= string.split(xxx,"/")
         s = baseurl + split2[0]
         return s

+    def postprocess_html(self, soup, first):
+        # remove picture
+        headerhtml = soup.find(True, {'class':'header'})
+        headerhtml.replaceWith("")
+        # remove close button
+        closehtml = soup.find(True, {'class':'close'})
+        closehtml.replaceWith("")
+        # remove banner advertisement
+        bannerhtml = soup.find(True, {'class':'bannerad'})
+        bannerhtml.replaceWith("")
+        # thanks kiklop74! This code removes all links from the text
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
+        return soup

View File

@ -0,0 +1,25 @@
from calibre.web.feeds.news import BasicNewsRecipe

class TriCityHeraldRecipe(BasicNewsRecipe):
    title = u'Tri-City Herald'
    description = 'The Tri-City Herald Mid-Columbia.'
    language = 'en'
    __author__ = 'Laura Gjovaag'
    oldest_article = 1.5
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True
    keep_only_tags = [
        dict(name='div', attrs={'id':'story_header'}),
        dict(name='img', attrs={'class':'imageCycle'}),
        dict(name='div', attrs={'id':['cycleImageCaption', 'story_body']})
    ]
    remove_tags = [
        dict(name='div', attrs={'id':'story_mlt'}),
        dict(name='a', attrs={'id':'commentCount'}),
        dict(name=['script', 'noscript', 'style'])]
    extra_css = 'h1{font: bold 140%;} #cycleImageCaption{font: monospace 60%}'
    feeds = [
        (u'Tri-City Herald Mid-Columbia', u'http://www.tri-cityherald.com/901/index.rss')
    ]

View File

@ -0,0 +1,80 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2011, Miroslav Vasko zemiak@gmail.com'

'''
.tyzden, a weekly news magazine (a week old issue)
'''

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from datetime import date
import re

class TyzdenRecipe(BasicNewsRecipe):
    __license__ = 'GPL v3'
    __author__ = 'zemiak'
    language = 'sk'
    version = 1

    publisher = u'www.tyzden.sk'
    category = u'Magazine'
    description = u'A conservative weekly magazine. The latest free issue'

    today = date.today()
    iso = today.isocalendar()
    year = iso[0]
    weeknum = iso[1]
    if (weeknum > 1):
        weeknum -= 1

    title = u'tyzden'

    base_url_path = 'http://www.tyzden.sk/casopis/' + str(year) + '/' + str(weeknum)
    base_url = base_url_path + '.html'

    oldest_article = 20
    max_articles_per_feed = 100
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True

    keep_only_tags = []
    keep_only_tags.append(dict(name='h1'))
    keep_only_tags.append(dict(name='div', attrs={'class': 'text_area top_nofoto'}))
    keep_only_tags.append(dict(name='div', attrs={'class': 'text_block'}))

    remove_tags_after = [dict(name='div', attrs={'class': 'text_block'})]

    def find_sections(self):
        soup = self.index_to_soup(self.base_url)
        # find cover pic
        imgdiv = soup.find('div', attrs={'class': 'foto'})
        if imgdiv is not None:
            img = imgdiv.find('img')
            if img is not None:
                self.cover_url = 'http://www.tyzden.sk/' + img['src']
        # end find cover pic
        for s in soup.findAll('a', attrs={'href': re.compile(r'rubrika/.*')}):
            yield (self.tag_to_string(s), s)

    def find_articles(self, soup):
        for art in soup.findAllNext('a'):
            if not art['href'].startswith('casopis/'):
                break
            url = art['href']
            title = self.tag_to_string(art)
            yield {
                'title': title, 'url': self.base_url_path + '/' + url, 'description': title,
                'date': strftime('%a, %d %b'),
            }

    def parse_index(self):
        feeds = []
        for title, soup in self.find_sections():
            feeds.append((title, list(self.find_articles(soup))))
        return feeds

View File

@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import with_statement
@ -29,13 +28,16 @@ class AdvancedUserRecipe1249039563(BasicNewsRecipe):
     language = 'nl'
     extra_css = '''
-        body{font-family:Arial,Helvetica,sans-serif; font-size:small;}
+        body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
         h1{font-size:large;}
     '''
     '''
     Change Log:
     Date: 10/10/10 - Modified code to include obfuscated to get the print version
     Author: Tony Stegall
+    Date: 01/01/11 - Modified for better results around December/January.
+    Author: Martin Tarenskeen
     '''
     #######################################################################################################

     temp_files = []
@ -48,11 +50,17 @@ class AdvancedUserRecipe1249039563(BasicNewsRecipe):
         year = date.today().year
         try:
             response = br.follow_link(url_regex='.*?(%d)(\\/)(article)(\\/)(print)(\\/)'%year, nr = 0)
             html = response.read()
         except:
-            response = br.open(url)
-            html = response.read()
+            year = year-1
+            try:
+                response = br.follow_link(url_regex='.*?(%d)(\\/)(article)(\\/)(print)(\\/)'%year, nr = 0)
+                html = response.read()
+            except:
+                response = br.open(url)
+                html = response.read()

         self.temp_files.append(PersistentTemporaryFile('_fa.html'))
         self.temp_files[-1].write(html)
@ -76,10 +84,3 @@ class AdvancedUserRecipe1249039563(BasicNewsRecipe):
         (u'Cultuur', u'http://www.volkskrant.nl/rss/kunst.rss'),
         (u'Gezondheid & Wetenschap', u'http://www.volkskrant.nl/rss/wetenschap.rss'),
         (u'Internet & Media', u'http://www.volkskrant.nl/rss/media.rss') ]
-    '''
-    example for formating
-    '''
-    # original url: http://www.volkskrant.nl/vk/nl/2668/Buitenland/article/detail/1031493/2010/10/10/Noord-Korea-ziet-nieuwe-leider.dhtml
-    # print url   : http://www.volkskrant.nl/vk/nl/2668/2010/article/print/detail/1031493/Noord-Korea-ziet-nieuwe-leider.dhtml

View File

@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1283848012(BasicNewsRecipe):
    description = 'The WallaNews.'
    cover_url = 'http://ftp5.bizportal.co.il/web/giflib/news/rsPhoto/sz_5/rsz_220_220_logo_walla.gif'
    title = u'Walla'
    language = 'he'
    __author__ = 'marbs'
    extra_css = 'img {max-width:100%;} body{direction: rtl;},title{direction: rtl; } ,article_description{direction: rtl; }, a.article{direction: rtl; } ,calibre_feed_description{direction: rtl; }'
    simultaneous_downloads = 5
    # remove_javascript = True
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 1
    max_articles_per_feed = 100
    # remove_attributes = ['width']
    keep_only_tags = [dict(name='div', attrs={'class':'wp-0-b w3'})]
    remove_tags = [dict(name='div', attrs={'class':'tagsContainer'})]
    # preprocess_regexps = [
    #     (re.compile(r'<p>&nbsp;</p>', re.DOTALL|re.IGNORECASE), lambda match: '')
    # ]

    feeds = [(u'חדשות', u'http://rss.walla.co.il/?w=/1/0/1/@rss'),
             (u'עסקים', u'http://rss.walla.co.il/?w=/2/3/1/@rss'),
             (u'תרבות', u'http://rss.walla.co.il/?w=/4/249/1/@rss'),
             (u'בריאות', u'http://rss.walla.co.il/?w=/5/18/1/@rss'),
             (u'TECH', u'http://rss.walla.co.il/?w=/6/4/1/@rss'),
             (u'אסטרולוגיה', u'http://rss.walla.co.il/?w=/8/3307/1/@rss'),
             (u'בעלי חיים', u'http://rss.walla.co.il/?w=/59/5703/1/@rss'),
             (u'רכב', u'http://rss.walla.co.il/?w=/31/4700/1/@rss'),
             (u'סלבס', u'http://rss.walla.co.il/?w=/22/3600/1/@rss'),
             (u'אוכל', u'http://rss.walla.co.il/?w=/9/903/1/@rss'),
             (u'אופנה', u'http://rss.walla.co.il/?w=/24/2120/1/@rss'),
             (u'ברנזה', u'http://rss.walla.co.il/?w=/27/3900/1/@rss'),
             (u'ZONE', u'http://rss.walla.co.il/?w=/18/500/1/@rss'),
             (u'ספורט', u'http://rss.walla.co.il/?w=/3/7/1/@rss')]

    def print_version(self, url):
        print_url = url + '/@@/item/printer'
        return print_url

View File

@ -0,0 +1,29 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1294938721(BasicNewsRecipe):
    title = u'Wichita Eagle'
    language = 'en'
    __author__ = 'Jason Cameron'
    description = 'Daily news from the Wichita Eagle'
    oldest_article = 1
    max_articles_per_feed = 30
    keep_only_tags = [dict(name='div', attrs={'id':'wide'})]
    feeds = [
        (u'Local News', u'http://www.kansas.com/news/local/index.rss'),
        (u'National News', u'http://www.kansas.com/news/nation-world/index.rss'),
        (u'Sports', u'http://www.kansas.com/sports/index.rss'),
        (u'Opinion', u'http://www.kansas.com/opinion/index.rss'),
        (u'Life', u'http://www.kansas.com/living/index.rss'),
        (u'Entertainment', u'http://www.kansas.com/entertainment/index.rss')
    ]

    def print_version(self, url):
        urlparts = url.split('/')
        newadd = urlparts[5] + '/v-print'
        return url.replace(url, newadd.join(url.split(urlparts[5])))
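    # Illustrative only (hypothetical URL): print_version inserts a 'v-print'
    # path segment after the sixth URL component, so a story link such as
    #   http://www.kansas.com/2011/01/15/1674063/some-story.html
    # would become
    #   http://www.kansas.com/2011/01/15/v-print/1674063/some-story.html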

View File

@ -2,8 +2,10 @@
 __license__ = 'GPL v3'
 __docformat__ = 'restructuredtext en'

+import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.chardet import xml_to_unicode

 class Wired_Daily(BasicNewsRecipe):
@ -15,30 +17,43 @@ class Wired_Daily(BasicNewsRecipe):
     no_stylesheets = True

+    preprocess_regexps = [(re.compile(r'<head.*</head>', re.DOTALL), lambda m:
+        '<head></head>')]

     remove_tags_before = dict(name='div', id='content')
-    remove_tags = [dict(id=['social_tools', 'outerWrapper', 'sidebar',
-        'footer', 'advertisement', 'blog_subscription_unit',
-        'brightcove_component']),
-        {'class':'entryActions'},
-        dict(name=['noscript', 'script'])]
+    remove_tags = [dict(id=['header', 'commenting_module', 'post_nav',
+        'social_tools', 'sidebar', 'footer', 'social_wishlist', 'pgwidget',
+        'outerWrapper', 'inf_widget']),
+        {'class':['entryActions', 'advertisement', 'entryTags']},
+        dict(name=['noscript', 'script']),
+        dict(name='h4', attrs={'class':re.compile(r'rat\d+')}),
+        {'class':lambda x: x and x.startswith('contentjump')},
+        dict(name='li', attrs={'class':['entryCategories', 'entryEdit']})]
     feeds = [
         ('Top News', 'http://feeds.wired.com/wired/index'),
-        ('Culture', 'http://feeds.wired.com/wired/culture'),
-        ('Software', 'http://feeds.wired.com/wired/software'),
-        ('Mac', 'http://feeds.feedburner.com/cultofmac/bFow'),
-        ('Gadgets', 'http://feeds.wired.com/wired/gadgets'),
-        ('Cars', 'http://feeds.wired.com/wired/cars'),
-        ('Entertainment', 'http://feeds.wired.com/wired/entertainment'),
-        ('Gaming', 'http://feeds.wired.com/wired/gaming'),
-        ('Science', 'http://feeds.wired.com/wired/science'),
-        ('Med Tech', 'http://feeds.wired.com/wired/medtech'),
-        ('Politics', 'http://feeds.wired.com/wired/politics'),
-        ('Tech Biz', 'http://feeds.wired.com/wired/techbiz'),
-        ('Commentary', 'http://feeds.wired.com/wired/commentary'),
+        ('Product Reviews',
+            'http://www.wired.com/reviews/feeds/latestProductsRss'),
+        ('Autopia', 'http://www.wired.com/autopia/feed/'),
+        ('Danger Room', 'http://www.wired.com/dangerroom/feed/'),
+        ('Epicenter', 'http://www.wired.com/epicenter/feed/'),
+        ('Gadget Lab', 'http://www.wired.com/gadgetlab/feed/'),
+        ('Geek Dad', 'http://www.wired.com/geekdad/feed/'),
+        ('Playbook', 'http://www.wired.com/playbook/feed/'),
+        ('Rawfile', 'http://www.wired.com/rawfile/feed/'),
+        ('This Day in Tech', 'http://www.wired.com/thisdayintech/feed/'),
+        ('Threat Level', 'http://www.wired.com/threatlevel/feed/'),
+        ('Underwire', 'http://www.wired.com/underwire/feed/'),
+        ('Web Monkey', 'http://www.webmonkey.com/feed/'),
+        ('Science', 'http://www.wired.com/wiredscience/feed/'),
     ]

+    def populate_article_metadata(self, article, soup, first):
+        if article.text_summary:
+            article.text_summary = xml_to_unicode(article.text_summary,
+                resolve_entities=True)[0]

     def print_version(self, url):
-        return url.replace('http://www.wired.com/', 'http://www.wired.com/print/')
+        return url + '/all/1'

View File

@ -0,0 +1,21 @@
from calibre.web.feeds.news import BasicNewsRecipe

class YakimaHeraldRepublicRecipe(BasicNewsRecipe):
    title = u'Yakima Herald-Republic'
    description = 'The Yakima Herald-Republic.'
    language = 'en'
    __author__ = 'Laura Gjovaag'
    oldest_article = 1.5
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True
    keep_only_tags = [
        dict(name='div', attrs={'id':['searchleft', 'headline_credit']}),
        dict(name='div', attrs={'class':['photo', 'cauthor', 'photocredit']}),
        dict(name='div', attrs={'id':['content_body', 'footerleft']})
    ]
    extra_css = '.cauthor {font: monospace 60%;} .photocredit {font: monospace 60%}'
    feeds = [
        (u'Yakima Herald Online', u'http://feeds.feedburner.com/yhronlinenews'),
    ]

View File

@ -0,0 +1,33 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
www.zerohedge.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe

class ZeroHedge(BasicNewsRecipe):
    title = 'Zero Hedge'
    __author__ = 'Darko Miletic'
    description = 'On a long enough timeline the survival rate for everyone drops to zero'
    oldest_article = 10
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = True
    encoding = 'utf8'
    publisher = 'zero hedge'
    category = 'news, USA, world, economy, politics'
    language = 'en'
    masthead_url = 'http://www.zerohedge.com/themes/newsflash/logo.png'
    publication_type = 'blog'
    extra_css = 'body{ font-family: sans-serif }'

    conversion_options = {
        'comments'  : description
        ,'tags'     : category
        ,'language' : language
        ,'publisher': publisher
    }

    feeds = [(u'Articles', u'http://feeds.feedburner.com/zerohedge/feed')]

View File

@ -0,0 +1,28 @@
{
"contains": "def evaluate(self, formatter, kwargs, mi, locals,\n val, test, value_if_present, value_if_not):\n if re.search(test, val):\n return value_if_present\n else:\n return value_if_not\n",
"divide": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x / y)\n",
"uppercase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return val.upper()\n",
"strcat": "def evaluate(self, formatter, kwargs, mi, locals, *args):\n i = 0\n res = ''\n for i in range(0, len(args)):\n res += args[i]\n return res\n",
"substr": "def evaluate(self, formatter, kwargs, mi, locals, str_, start_, end_):\n return str_[int(start_): len(str_) if int(end_) == 0 else int(end_)]\n",
"ifempty": "def evaluate(self, formatter, kwargs, mi, locals, val, value_if_empty):\n if val:\n return val\n else:\n return value_if_empty\n",
"field": "def evaluate(self, formatter, kwargs, mi, locals, name):\n return formatter.get_value(name, [], kwargs)\n",
"capitalize": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return capitalize(val)\n",
"list_item": "def evaluate(self, formatter, kwargs, mi, locals, val, index, sep):\n if not val:\n return ''\n index = int(index)\n val = val.split(sep)\n try:\n return val[index]\n except:\n return ''\n",
"shorten": "def evaluate(self, formatter, kwargs, mi, locals,\n val, leading, center_string, trailing):\n l = max(0, int(leading))\n t = max(0, int(trailing))\n if len(val) > l + len(center_string) + t:\n return val[0:l] + center_string + ('' if t == 0 else val[-t:])\n else:\n return val\n",
"re": "def evaluate(self, formatter, kwargs, mi, locals, val, pattern, replacement):\n return re.sub(pattern, replacement, val)\n",
"add": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x + y)\n",
"lookup": "def evaluate(self, formatter, kwargs, mi, locals, val, *args):\n if len(args) == 2: # here for backwards compatibility\n if val:\n return formatter.vformat('{'+args[0].strip()+'}', [], kwargs)\n else:\n return formatter.vformat('{'+args[1].strip()+'}', [], kwargs)\n if (len(args) % 2) != 1:\n raise ValueError(_('lookup requires either 2 or an odd number of arguments'))\n i = 0\n while i < len(args):\n if i + 1 >= len(args):\n return formatter.vformat('{' + args[i].strip() + '}', [], kwargs)\n if re.search(args[i], val):\n return formatter.vformat('{'+args[i+1].strip() + '}', [], kwargs)\n i += 2\n",
"template": "def evaluate(self, formatter, kwargs, mi, locals, template):\n template = template.replace('[[', '{').replace(']]', '}')\n return formatter.safe_format(template, kwargs, 'TEMPLATE', mi)\n",
"print": "def evaluate(self, formatter, kwargs, mi, locals, *args):\n print args\n return None\n",
"titlecase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return titlecase(val)\n",
"test": "def evaluate(self, formatter, kwargs, mi, locals, val, value_if_set, value_not_set):\n if val:\n return value_if_set\n else:\n return value_not_set\n",
"eval": "def evaluate(self, formatter, kwargs, mi, locals, template):\n from formatter import eval_formatter\n template = template.replace('[[', '{').replace(']]', '}')\n return eval_formatter.safe_format(template, locals, 'EVAL', None)\n",
"multiply": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x * y)\n",
"subtract": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x - y)\n",
"count": "def evaluate(self, formatter, kwargs, mi, locals, val, sep):\n return unicode(len(val.split(sep)))\n",
"lowercase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return val.lower()\n",
"assign": "def evaluate(self, formatter, kwargs, mi, locals, target, value):\n locals[target] = value\n return value\n",
"switch": "def evaluate(self, formatter, kwargs, mi, locals, val, *args):\n if (len(args) % 2) != 1:\n raise ValueError(_('switch requires an odd number of arguments'))\n i = 0\n while i < len(args):\n if i + 1 >= len(args):\n return args[i]\n if re.search(args[i], val):\n return args[i+1]\n i += 2\n",
"strcmp": "def evaluate(self, formatter, kwargs, mi, locals, x, y, lt, eq, gt):\n v = strcmp(x, y)\n if v < 0:\n return lt\n if v == 0:\n return eq\n return gt\n",
"cmp": "def evaluate(self, formatter, kwargs, mi, locals, x, y, lt, eq, gt):\n x = float(x if x else 0)\n y = float(y if y else 0)\n if x < y:\n return lt\n if x == y:\n return eq\n return gt\n"
}
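These entries are the built-in functions of calibre's template language; each value is the Python source of that function's evaluate() method (the file itself is generated by the Resources build command later in this diff). As a rough illustration of how they are used, a template calls a function with the field's value as the implicit first argument; the field names here are examples only:

    {title:uppercase()}
    {series:ifempty(Standalone)}
    {tags:count(,)}

The first upper-cases the title, the second substitutes 'Standalone' when the series field is empty, and the third counts comma separated tags.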

View File

@ -287,7 +287,7 @@
                 <xsl:value-of select="count(preceding::rtf:footnote) + 1"/>
                 <xsl:text>]</xsl:text>
             </xsl:when>
-            <xsl:when test="(@superscript = 'true')">
+            <xsl:when test="(@superscript)">
                 <xsl:element name="sup">
                     <xsl:element name="span">
                         <xsl:attribute name="class">
@ -297,7 +297,7 @@
                     </xsl:element>
                 </xsl:element>
             </xsl:when>
-            <xsl:when test="(@underscript = 'true')">
+            <xsl:when test="(@underscript or @subscript)">
                 <xsl:element name="sub">
                     <xsl:element name="span">
                         <xsl:attribute name="class">

View File

@ -41,6 +41,7 @@ function scroll_to_bookmark(bookmark) {
         $.scrollTo($(bm[0]), 1000,
             {
                 over: ratio,
+                axis: 'y', // Do not scroll in the x direction
                 onAfter: function(){window.py_bridge.animated_scroll_done()}
             }
         );

View File

@ -117,11 +117,10 @@ if iswindows:
     poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
             r'%s\poppler;%s'%(sw_inc_dir, sw_inc_dir))
-    popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[1]+r'\qt4']
     poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', sw_lib_dir)
     popplerqt4_lib_dirs = poppler_lib_dirs
     poppler_libs = ['poppler']
-    magick_inc_dirs = [os.path.join(prefix, 'build', 'ImageMagick-6.5.6')]
+    magick_inc_dirs = [os.path.join(prefix, 'build', 'ImageMagick-6.6.6')]
     magick_lib_dirs = [os.path.join(magick_inc_dirs[0], 'VisualMagick', 'lib')]
     magick_libs = ['CORE_RL_wand_', 'CORE_RL_magick_']
     podofo_inc = os.path.join(sw_inc_dir, 'podofo')
@ -131,7 +130,6 @@ elif isosx:
     fc_lib = '/sw/lib'
     poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
             '/sw/build/poppler-0.14.5/poppler:/sw/build/poppler-0.14.5')
-    popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
     poppler_lib_dirs = consolidate('POPPLER_LIB_DIR',
             '/sw/lib')
     poppler_libs = ['poppler']
@ -150,9 +148,6 @@ else:
     # Include directories
     poppler_inc_dirs = pkgconfig_include_dirs('poppler',
             'POPPLER_INC_DIR', '/usr/include/poppler')
-    popplerqt4_inc_dirs = pkgconfig_include_dirs('poppler-qt4', '', '')
-    if not popplerqt4_inc_dirs:
-        popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
     png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR',
             '/usr/include')
     magick_inc_dirs = pkgconfig_include_dirs('MagickWand', 'MAGICK_INC', '/usr/include/ImageMagick')
@ -187,20 +182,17 @@ if not poppler_inc_dirs or not os.path.exists(
 poppler_error = \
     ('Poppler not found on your system. Various PDF related',
     ' functionality will not work. Use the POPPLER_INC_DIR and',
-    ' POPPLER_LIB_DIR environment variables.')
+    ' POPPLER_LIB_DIR environment variables. calibre requires '
+    ' the poppler XPDF headers. If your distro does not '
+    ' include them you will have to re-compile poppler '
+    ' by hand with --enable-xpdf-headers')

-popplerqt4_error = None
-if not popplerqt4_inc_dirs or not os.path.exists(
-        os.path.join(popplerqt4_inc_dirs[-1], 'poppler-qt4.h')):
-    popplerqt4_error = \
-        ('Poppler Qt4 bindings not found on your system.')

 magick_error = None
 if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0],
         'wand')):
     magick_error = ('ImageMagick not found on your system. '
             'Try setting the environment variables MAGICK_INC '
-            'and MAGICK_LIB to help calibre locate the inclue and libbrary '
+            'and MAGICK_LIB to help calibre locate the include and library '
             'files.')

 podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib)

View File

@ -612,8 +612,13 @@ class Py2App(object):
         dmg = os.path.join(destdir, volname+'.dmg')
         if os.path.exists(dmg):
             os.unlink(dmg)
-        subprocess.check_call(['/usr/bin/hdiutil', 'create', '-srcfolder', os.path.abspath(d),
+        tdir = tempfile.mkdtemp()
+        shutil.copytree(d, os.path.join(tdir, os.path.basename(d)),
+                symlinks=True)
+        os.symlink('/Applications', os.path.join(tdir, 'Applications'))
+        subprocess.check_call(['/usr/bin/hdiutil', 'create', '-srcfolder', tdir,
                 '-volname', volname, '-format', format, dmg])
+        shutil.rmtree(tdir)
         if internet_enable:
             subprocess.check_call(['/usr/bin/hdiutil', 'internet-enable', '-yes', dmg])
         size = os.stat(dmg).st_size/(1024*1024.)

View File

@ -18,7 +18,7 @@ QT_DLLS = ['Core', 'Gui', 'Network', 'Svg', 'WebKit', 'Xml', 'XmlPatterns']
 LIBUSB_DIR = 'C:\\libusb'
 LIBUNRAR = 'C:\\Program Files\\UnrarDLL\\unrar.dll'
 SW = r'C:\cygwin\home\kovid\sw'
-IMAGEMAGICK = os.path.join(SW, 'build', 'ImageMagick-6.5.6',
+IMAGEMAGICK = os.path.join(SW, 'build', 'ImageMagick-6.6.6',
         'VisualMagick', 'bin')
 VERSION = re.sub('[a-z]\d+', '', __version__)

View File

@ -301,12 +301,14 @@ int projectType = MULTITHREADEDDLL;
 Run configure.bat in a visual studio command prompt
+Run configure.exe generated by configure.bat
 Edit magick/magick-config.h
 Undefine ProvideDllMain and MAGICKCORE_X11_DELEGATE
 Now open VisualMagick/VisualDynamicMT.sln set to Release
-Remove the CORE_xlib project
+Remove the CORE_xlib and UTIL_Imdisplay project
+CORE_Magick++

 calibre
 ---------

View File

@ -84,6 +84,23 @@ class Resources(Command):
         cPickle.dump(complete, open(dest, 'wb'), -1)

+        self.info('\tCreating template-functions.json')
+        dest = self.j(self.RESOURCES, 'template-functions.json')
+        function_dict = {}
+        import inspect
+        from calibre.utils.formatter_functions import all_builtin_functions
+        for obj in all_builtin_functions:
+            eval_func = inspect.getmembers(obj,
+                    lambda x: inspect.ismethod(x) and x.__name__ == 'evaluate')
+            try:
+                lines = [l[4:] for l in inspect.getsourcelines(eval_func[0][1])[0]]
+            except:
+                continue
+            lines = ''.join(lines)
+            function_dict[obj.name] = lines
+        import json
+        json.dump(function_dict, open(dest, 'wb'), indent=4)

     def clean(self):
         for x in ('scripts', 'recipes', 'ebook-convert-complete'):
             x = self.j(self.RESOURCES, x+'.pickle')

View File

@ -6,7 +6,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import os, re, cStringIO, base64, httplib, subprocess, hashlib, shutil
+import os, re, cStringIO, base64, httplib, subprocess, hashlib, shutil, time
 from subprocess import check_call
 from tempfile import NamedTemporaryFile, mkdtemp
@ -160,7 +160,7 @@ class UploadToGoogleCode(Command):
         return 'multipart/form-data; boundary=%s' % BOUNDARY, CRLF.join(body)

-    def upload(self, fname, desc, labels=[]):
+    def upload(self, fname, desc, labels=[], retry=0):
         form_fields = [('summary', desc)]
         form_fields.extend([('label', l.strip()) for l in labels])
@ -183,6 +183,10 @@ class UploadToGoogleCode(Command):
             print 'Failed to upload with code %d and reason: %s'%(resp.status,
                     resp.reason)
+            if retry < 1:
+                print 'Retrying in 5 seconds....'
+                time.sleep(5)
+                return self.upload(fname, desc, labels=labels, retry=retry+1)
             raise Exception('Failed to upload '+fname)

View File

@ -254,7 +254,7 @@ def browser(honor_time=True, max_time=2, mobile_browser=False):
     opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
     opener.set_handle_robots(False)
     opener.addheaders = [('User-agent', ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
-        'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
+        'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13')]
     http_proxy = get_proxies().get('http', None)
     if http_proxy:
         opener.set_proxies({'http':http_proxy})
@ -459,6 +459,18 @@ def force_unicode(obj, enc=preferred_encoding):
         obj = obj.decode('utf-8')
     return obj

+def as_unicode(obj, enc=preferred_encoding):
+    if not isbytestring(obj):
+        try:
+            obj = unicode(obj)
+        except:
+            try:
+                obj = str(obj)
+            except:
+                obj = repr(obj)
+    return force_unicode(obj, enc=enc)
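+# Example usage (illustrative, not part of this commit): as_unicode safely
+# stringifies an arbitrary object, such as a caught exception:
+#   try:
+#       do_something()
+#   except Exception as e:
+#       error_message = as_unicode(e)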
 def human_readable(size):
     """ Convert a size in bytes into a human readable form """

View File

@ -2,7 +2,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'

 __appname__ = 'calibre'
-__version__ = '0.7.35'
+__version__ = '0.7.40'
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"

 import re

View File

@ -80,6 +80,100 @@ class Plugin(object): # {{{
         '''
         pass

+    def config_widget(self):
+        '''
+        Implement this method and :meth:`save_settings` in your plugin to
+        use a custom configuration dialog, rather than relying on the simple
+        string based default customization.
+
+        This method, if implemented, must return a QWidget. The widget can have
+        an optional method validate() that takes no arguments and is called
+        immediately after the user clicks OK. Changes are applied if and only
+        if the method returns True.
+        '''
+        raise NotImplementedError()
+
+    def save_settings(self, config_widget):
+        '''
+        Save the settings specified by the user with config_widget.
+
+        :param config_widget: The widget returned by :meth:`config_widget`.
+        '''
+        raise NotImplementedError()
+
+    def do_user_config(self, parent=None):
+        '''
+        This method shows a configuration dialog for this plugin. It returns
+        True if the user clicks OK, False otherwise. The changes are
+        automatically applied.
+        '''
+        from PyQt4.Qt import QDialog, QDialogButtonBox, QVBoxLayout, \
+                QLabel, Qt, QLineEdit
+        from calibre.gui2 import gprefs
+
+        prefname = 'plugin config dialog:'+self.type + ':' + self.name
+        geom = gprefs.get(prefname, None)
+
+        config_dialog = QDialog(parent)
+        button_box = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
+        v = QVBoxLayout(config_dialog)
+
+        def size_dialog():
+            if geom is None:
+                config_dialog.resize(config_dialog.sizeHint())
+            else:
+                config_dialog.restoreGeometry(geom)
+
+        button_box.accepted.connect(config_dialog.accept)
+        button_box.rejected.connect(config_dialog.reject)
+        config_dialog.setWindowTitle(_('Customize') + ' ' + self.name)
+
+        try:
+            config_widget = self.config_widget()
+        except NotImplementedError:
+            config_widget = None
+
+        if config_widget is not None:
+            v.addWidget(config_widget)
+            v.addWidget(button_box)
+            size_dialog()
+            config_dialog.exec_()
+
+            if config_dialog.result() == QDialog.Accepted:
+                if hasattr(config_widget, 'validate'):
+                    if config_widget.validate():
+                        self.save_settings(config_widget)
+                else:
+                    self.save_settings(config_widget)
+        else:
+            from calibre.customize.ui import plugin_customization, \
+                customize_plugin
+            help_text = self.customization_help(gui=True)
+            help_text = QLabel(help_text, config_dialog)
+            help_text.setWordWrap(True)
+            help_text.setTextInteractionFlags(Qt.LinksAccessibleByMouse
+                    | Qt.LinksAccessibleByKeyboard)
+            help_text.setOpenExternalLinks(True)
+            v.addWidget(help_text)
+            sc = plugin_customization(self)
+            if not sc:
+                sc = ''
+            sc = sc.strip()
+            sc = QLineEdit(sc, config_dialog)
+            v.addWidget(sc)
+            v.addWidget(button_box)
+            size_dialog()
+            config_dialog.exec_()
+            if config_dialog.result() == QDialog.Accepted:
+                sc = unicode(sc.text()).strip()
+                customize_plugin(self, sc)
+
+        geom = bytearray(config_dialog.saveGeometry())
+        gprefs[prefname] = geom
+
+        return config_dialog.result()

     def load_resources(self, names):
         '''
         If this plugin comes in a ZIP file (user added plugin), this method
@ -307,6 +401,14 @@ class CatalogPlugin(Plugin): # {{{
     #: cli_options parsed in library.cli:catalog_option_parser()
     cli_options = []

+    def _field_sorter(self, key):
+        '''
+        Custom fields sort after standard fields
+        '''
+        if key.startswith('#'):
+            return '~%s' % key[1:]
+        else:
+            return key

     def search_sort_db(self, db, opts):
@ -315,18 +417,18 @@ class CatalogPlugin(Plugin): # {{{
         if opts.sort_by:
             # 2nd arg = ascending
             db.sort(opts.sort_by, True)
         return db.get_data_as_dict(ids=opts.ids)

-    def get_output_fields(self, opts):
+    def get_output_fields(self, db, opts):
         # Return a list of requested fields, with opts.sort_by first
-        all_fields = set(
+        all_std_fields = set(
             ['author_sort','authors','comments','cover','formats',
              'id','isbn','ondevice','pubdate','publisher','rating',
              'series_index','series','size','tags','timestamp',
              'title','uuid'])
+        all_custom_fields = set(db.custom_field_keys())
+        all_fields = all_std_fields.union(all_custom_fields)
+        fields = all_fields
         if opts.fields != 'all':
             # Make a list from opts.fields
             requested_fields = set(opts.fields.split(','))
@ -337,7 +439,7 @@ class CatalogPlugin(Plugin): # {{{
         if not opts.connected_device['is_device_connected'] and 'ondevice' in fields:
             fields.pop(int(fields.index('ondevice')))

-        fields.sort()
+        fields = sorted(fields, key=self._field_sorter)
         if opts.sort_by and opts.sort_by in fields:
             fields.insert(0,fields.pop(int(fields.index(opts.sort_by))))
         return fields

View File

@ -478,7 +478,7 @@ from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, \
 from calibre.devices.sne.driver import SNE
 from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, \
         GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, Q600, LUMIREAD, ALURATEK_COLOR, \
-        TREKSTOR, EEEREADER
+        TREKSTOR, EEEREADER, NEXTBOOK
 from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
 from calibre.devices.kobo.driver import KOBO
 from calibre.devices.bambook.driver import BAMBOOK
@ -606,6 +606,7 @@ plugins += [
     BAMBOOK,
     TREKSTOR,
     EEEREADER,
+    NEXTBOOK,
     ITUNES,
 ]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
@ -704,13 +705,17 @@ class ActionTweakEpub(InterfaceActionBase):
     name = 'Tweak ePub'
     actual_plugin = 'calibre.gui2.actions.tweak_epub:TweakEpubAction'

+class ActionNextMatch(InterfaceActionBase):
+    name = 'Next Match'
+    actual_plugin = 'calibre.gui2.actions.next_match:NextMatchAction'

 plugins += [ActionAdd, ActionFetchAnnotations, ActionGenerateCatalog,
         ActionConvert, ActionDelete, ActionEditMetadata, ActionView,
         ActionFetchNews, ActionSaveToDisk, ActionShowBookDetails,
         ActionRestart, ActionOpenFolder, ActionConnectShare,
         ActionSendToDevice, ActionHelp, ActionPreferences, ActionSimilarBooks,
         ActionAddToLibrary, ActionEditCollections, ActionChooseLibrary,
-        ActionCopyToLibrary, ActionTweakEpub]
+        ActionCopyToLibrary, ActionTweakEpub, ActionNextMatch]

 # }}}
@ -842,6 +847,17 @@ class Plugboard(PreferencesPlugin):
     config_widget = 'calibre.gui2.preferences.plugboard'
     description = _('Change metadata fields before saving/sending')

+class TemplateFunctions(PreferencesPlugin):
+    name = 'TemplateFunctions'
+    icon = I('template_funcs.png')
+    gui_name = _('Template Functions')
+    category = 'Advanced'
+    gui_category = _('Advanced')
+    category_order = 5
+    name_order = 4
+    config_widget = 'calibre.gui2.preferences.template_functions'
+    description = _('Create your own template functions')

 class Email(PreferencesPlugin):
     name = 'Email'
     icon = I('mail.png')
@ -903,6 +919,6 @@ class Misc(PreferencesPlugin):
 plugins += [LookAndFeel, Behavior, Columns, Toolbar, InputOptions,
         CommonOptions, OutputOptions, Adding, Saving, Sending, Plugboard,
-        Email, Server, Plugins, Tweaks, Misc]
+        Email, Server, Plugins, Tweaks, Misc, TemplateFunctions]

 #}}}

View File

@ -439,6 +439,13 @@ class TabletOutput(iPadOutput):
     screen_size = (sys.maxint, sys.maxint)
     comic_screen_size = (sys.maxint, sys.maxint)

+class SamsungGalaxy(TabletOutput):
+    name = 'Samsung Galaxy'
+    short_name = 'galaxy'
+    description = _('Intended for the Samsung Galaxy and similar tablet devices with '
+            'a resolution of 600x1280')
+    screen_size = comic_screen_size = (600, 1280)

 class SonyReaderOutput(OutputProfile):
     name = 'Sony Reader'
@ -617,6 +624,8 @@ class KindleDXOutput(OutputProfile):
     #comic_screen_size = (741, 1022)
     supports_mobi_indexing = True
     periodical_date_in_title = False
+    missing_char = u'x\u2009'
+    empty_ratings_char = u'\u2606'
     ratings_char = u'\u2605'
     read_char = u'\u2713'
     mobi_ems_per_blockquote = 2.0
@ -707,7 +716,7 @@ class BambookOutput(OutputProfile):
 output_profiles = [OutputProfile, SonyReaderOutput, SonyReader300Output,
         SonyReader900Output, MSReaderOutput, MobipocketOutput, HanlinV3Output,
         HanlinV5Output, CybookG3Output, CybookOpusOutput, KindleOutput,
-        iPadOutput, KoboReaderOutput, TabletOutput,
+        iPadOutput, KoboReaderOutput, TabletOutput, SamsungGalaxy,
         SonyReaderLandscapeOutput, KindleDXOutput, IlliadOutput,
         IRexDR1000Output, IRexDR800Output, JetBook5Output, NookOutput,
         BambookOutput, NookColorOutput]

View File

@ -27,15 +27,16 @@ class ANDROID(USBMS):
         0x040d : { 0x8510 : [0x0001], 0x0851 : [0x1] },

         # Motorola
-        0x22b8 : { 0x41d9 : [0x216], 0x2d67 : [0x100], 0x41db : [0x216],
-                0x4285 : [0x216], 0x42a3 : [0x216] },
+        0x22b8 : { 0x41d9 : [0x216], 0x2d61 : [0x100], 0x2d67 : [0x100],
+                0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216],
+                0x4286 : [0x216], 0x42b3 : [0x216] },

         # Sony Ericsson
         0xfce : { 0xd12e : [0x0100]},

         # Google
-        0x18d1 : { 0x4e11 : [0x0100, 0x226, 0x227], 0x4e12: [0x0100, 0x226,
-            0x227]},
+        0x18d1 : { 0x4e11 : [0x0100, 0x226, 0x227], 0x4e12: [0x0100, 0x226,
+            0x227], 0x4e21: [0x0100, 0x226, 0x227]},

         # Samsung
         0x04e8 : { 0x681d : [0x0222, 0x0223, 0x0224, 0x0400],
@ -52,6 +53,9 @@ class ANDROID(USBMS):
         # LG
         0x1004 : { 0x61cc : [0x100] },

+        # Archos
+        0x0e79 : { 0x1420 : [0x0216]},

     }
     EBOOK_DIR_MAIN = ['eBooks/import', 'wordplayer/calibretransfer', 'Books']
     EXTRA_CUSTOMIZATION_MESSAGE = _('Comma separated list of directories to '
@ -60,17 +64,19 @@ class ANDROID(USBMS):
     EXTRA_CUSTOMIZATION_DEFAULT = ', '.join(EBOOK_DIR_MAIN)

     VENDOR_NAME = ['HTC', 'MOTOROLA', 'GOOGLE_', 'ANDROID', 'ACER',
-            'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE']
+            'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE', 'ARCHOS']
     WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE',
             '__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897',
             'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID',
-            'SCH-I500_CARD', 'SPH-D700_CARD', 'MB810', 'GT-P1000', 'DESIRE']
+            'SCH-I500_CARD', 'SPH-D700_CARD', 'MB810', 'GT-P1000', 'DESIRE',
+            'SGH-T849', '_MB300', 'A70S']
     WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
-            'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD']
+            'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
+            'A70S']

-    OSX_MAIN_MEM = 'HTC Android Phone Media'
+    OSX_MAIN_MEM = 'Android Device Main Memory'

-    MAIN_MEMORY_VOLUME_LABEL = 'Android Phone Internal Memory'
+    MAIN_MEMORY_VOLUME_LABEL = 'Android Device Main Memory'

     SUPPORTS_SUB_DIRS = True

View File

@ -29,12 +29,16 @@ class BAMBOOK(DeviceConfig, DevicePlugin):
     booklist_class = BookList
     book_class = Book

+    ip = None

     FORMATS = [ "snb" ]
     VENDOR_ID = 0x230b
     PRODUCT_ID = 0x0001
     BCD = None
     CAN_SET_METADATA = False
     THUMBNAIL_HEIGHT = 155
+    EXTRA_CUSTOMIZATION_MESSAGE = \
+        _("Device IP Address (restart calibre after changing)")

     icon = I("devices/bambook.png")

     # OPEN_FEEDBACK_MESSAGE = _(
@ -47,6 +51,10 @@ class BAMBOOK(DeviceConfig, DevicePlugin):
     METADATA_FILE_GUID = 'calibremetadata.snb'

     bambook = None
+    is_connected = False
+
+    def __init__(self, ip):
+        self.ip = ip

     def reset(self, key='-1', log_packets=False, report_progress=None,
             detected_device=None) :
@ -60,15 +68,23 @@ class BAMBOOK(DeviceConfig, DevicePlugin):
             self.eject()

         # Connect
         self.bambook = Bambook()
-        self.bambook.Connect()
+        self.bambook.Connect(ip = self.ip, timeout = 10000)
         if self.bambook.GetState() != CONN_CONNECTED:
             self.bambook = None
-            raise Exception(_("Unable to connect to Bambook."))
+            raise OpenFeedback(_("Unable to connect to Bambook. \n"
+                "If you are trying to connect via Wi-Fi, "
+                "please make sure the IP address of Bambook has been correctly configured."))
+        self.is_connected = True
+        return True
+
+    def unmount_device(self):
+        self.eject()

     def eject(self):
         if self.bambook:
             self.bambook.Disconnect()
             self.bambook = None
+            self.is_connected = False

     def post_yank_cleanup(self):
         self.eject()
@ -475,3 +491,8 @@ class BAMBOOK(DeviceConfig, DevicePlugin):
     def get_guid(uuid):
         guid = hashlib.md5(uuid).hexdigest()[0:15] + ".snb"
         return guid
+
+class BAMBOOKWifi(BAMBOOK):
+    def is_usb_connected(self, devices_on_system, debug=False,
+            only_presence=False):
+        return self.is_connected, self

View File

@ -329,6 +329,8 @@ class Bambook:
         self.handle = None

     def Connect(self, ip = DEFAULT_BAMBOOK_IP, timeout = 10000):
+        if ip == None or ip == '':
+            ip = DEFAULT_BAMBOOK_IP
         self.handle = BambookConnect(ip, timeout)
         if self.handle and self.handle != 0:
             return True

View File

@ -18,9 +18,9 @@ class FOLDER_DEVICE_FOR_CONFIG(USBMS):
     supported_platforms = ['windows', 'osx', 'linux']
     FORMATS = ['epub', 'fb2', 'mobi', 'azw', 'lrf', 'tcr', 'pmlz', 'lit',
             'rtf', 'rb', 'pdf', 'oeb', 'txt', 'pdb', 'prc']
-    VENDOR_ID = 0xffff
-    PRODUCT_ID = 0xffff
-    BCD = 0xffff
+    VENDOR_ID = [0xffff]
+    PRODUCT_ID = [0xffff]
+    BCD = [0xffff]
     DEVICE_PLUGBOARD_NAME = 'FOLDER_DEVICE'
@ -34,9 +34,9 @@ class FOLDER_DEVICE(USBMS):
     supported_platforms = ['windows', 'osx', 'linux']
     FORMATS = FOLDER_DEVICE_FOR_CONFIG.FORMATS
-    VENDOR_ID = 0xffff
-    PRODUCT_ID = 0xffff
-    BCD = 0xffff
+    VENDOR_ID = [0xffff]
+    PRODUCT_ID = [0xffff]
+    BCD = [0xffff]
     DEVICE_PLUGBOARD_NAME = 'FOLDER_DEVICE'
     THUMBNAIL_HEIGHT = 68  # Height for thumbnails on device

View File

@ -20,11 +20,11 @@ class IRIVER_STORY(USBMS):
     FORMATS = ['epub', 'fb2', 'pdf', 'djvu', 'txt']

     VENDOR_ID = [0x1006]
-    PRODUCT_ID = [0x4023, 0x4025]
+    PRODUCT_ID = [0x4023, 0x4024, 0x4025]
     BCD = [0x0323]

     VENDOR_NAME = 'IRIVER'
-    WINDOWS_MAIN_MEM = ['STORY', 'STORY_EB05']
+    WINDOWS_MAIN_MEM = ['STORY', 'STORY_EB05', 'STORY_WI-FI']
     WINDOWS_CARD_A_MEM = ['STORY', 'STORY_SD']

     #OSX_MAIN_MEM = 'Kindle Internal Storage Media'

View File

@ -27,7 +27,7 @@ class Book(Book_):
         self.size = size  # will be set later if None

-        if ContentType == '6':
+        if ContentType == '6' and date is not None:
             self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
         else:
             try:
View File

@ -33,7 +33,7 @@ class KOBO(USBMS):
     booklist_class = CollectionsBookList

     # Ordered list of supported formats
-    FORMATS = ['epub', 'pdf']
+    FORMATS = ['epub', 'pdf', 'txt', 'cbz', 'cbr']
     CAN_SET_METADATA = ['collections']

     VENDOR_ID = [0x2237]
@ -409,7 +409,7 @@ class KOBO(USBMS):
             else:
                 ContentType = 901
         else:  # if extension == '.html' or extension == '.txt':
-            ContentType = 999  # Yet another hack: to get around Kobo changing how ContentID is stored
+            ContentType = 901  # Yet another hack: to get around Kobo changing how ContentID is stored
         return ContentType

     def path_from_contentid(self, ContentID, ContentType, MimeType, oncard):

View File

@@ -259,8 +259,28 @@ class EEEREADER(USBMS):
     PRODUCT_ID  = [0x178f]
     BCD         = [0x0319]
 
-    EBOOK_DIR_MAIN = 'Books'
+    EBOOK_DIR_MAIN = EBOOK_DIR_CARD_A = 'Book'
 
     VENDOR_NAME      = 'LINUX'
     WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'FILE-STOR_GADGET'
+
+class NEXTBOOK(USBMS):
+
+    name           = 'Nextbook device interface'
+    gui_name       = 'Nextbook'
+    description    = _('Communicate with the Nextbook Reader')
+    author         = 'Kovid Goyal'
+    supported_platforms = ['windows', 'osx', 'linux']
+
+    # Ordered list of supported formats
+    FORMATS     = ['epub', 'fb2', 'txt', 'pdf']
+
+    VENDOR_ID   = [0x05e3]
+    PRODUCT_ID  = [0x0726]
+    BCD         = [0x021a]
+
+    EBOOK_DIR_MAIN = ''
+
+    VENDOR_NAME      = 'NEXT2'
+    WINDOWS_MAIN_MEM = '1.0.14'

View File

@@ -91,3 +91,19 @@ class NOOK_COLOR(NOOK):
 
     EBOOK_DIR_MAIN = 'My Files/Books'
 
+    '''
+    def create_upload_path(self, path, mdata, fname, create_dirs=True):
+        filepath = NOOK.create_upload_path(self, path, mdata, fname,
+                create_dirs=create_dirs)
+        edm = self.EBOOK_DIR_MAIN.replace('/', os.sep)
+        npath = os.path.join(edm, _('News')) + os.sep
+        if npath in filepath:
+            filepath = filepath.replace(npath, os.sep.join('My Files',
+                'Magazines')+os.sep)
+            filedir = os.path.dirname(filepath)
+            if create_dirs and not os.path.exists(filedir):
+                os.makedirs(filedir)
+        return filepath
+    '''

View File

@@ -61,14 +61,37 @@ class PRS505(USBMS):
     ALL_BY_TITLE  = _('All by title')
     ALL_BY_AUTHOR = _('All by author')
 
-    EXTRA_CUSTOMIZATION_MESSAGE = _('Comma separated list of metadata fields '
+    EXTRA_CUSTOMIZATION_MESSAGE = [
+        _('Comma separated list of metadata fields '
             'to turn into collections on the device. Possibilities include: ')+\
             'series, tags, authors' +\
             _('. Two special collections are available: %s:%s and %s:%s. Add '
             'these values to the list to enable them. The collections will be '
             'given the name provided after the ":" character.')%(
-            'abt', ALL_BY_TITLE, 'aba', ALL_BY_AUTHOR)
-    EXTRA_CUSTOMIZATION_DEFAULT = ', '.join(['series', 'tags'])
+            'abt', ALL_BY_TITLE, 'aba', ALL_BY_AUTHOR),
+        _('Upload separate cover thumbnails for books (newer readers)') +
+            ':::'+_('Normally, the SONY readers get the cover image from the'
+                ' ebook file itself. With this option, calibre will send a '
+                'separate cover image to the reader, useful if you are '
+                'sending DRMed books in which you cannot change the cover.'
+                ' WARNING: This option should only be used with newer '
+                'SONY readers: 350, 650, 950 and newer.'),
+        _('Refresh separate covers when using automatic management (newer readers)') +
+            ':::' +
+            _('Set this option to have separate book covers uploaded '
+              'every time you connect your device. Unset this option if '
+              'you have so many books on the reader that performance is '
+              'unacceptable.')
+    ]
+    EXTRA_CUSTOMIZATION_DEFAULT = [
+            ', '.join(['series', 'tags']),
+            False,
+            False
+    ]
+
+    OPT_COLLECTIONS    = 0
+    OPT_UPLOAD_COVERS  = 1
+    OPT_REFRESH_COVERS = 2
 
     plugboard = None
     plugboard_func = None
@@ -159,7 +182,7 @@ class PRS505(USBMS):
         opts = self.settings()
         if opts.extra_customization:
             collections = [x.strip() for x in
-                    opts.extra_customization.split(',')]
+                    opts.extra_customization[self.OPT_COLLECTIONS].split(',')]
         else:
             collections = []
         debug_print('PRS505: collection fields:', collections)
@@ -171,6 +194,20 @@ class PRS505(USBMS):
             c.update(blists, collections, pb)
             c.write()
 
+        if opts.extra_customization[self.OPT_REFRESH_COVERS]:
+            debug_print('PRS505: uploading covers in sync_booklists')
+            for idx,bl in blists.items():
+                prefix = self._card_a_prefix if idx == 1 else \
+                                self._card_b_prefix if idx == 2 \
+                                else self._main_prefix
+                for book in bl:
+                    p = os.path.join(prefix, book.lpath)
+                    self._upload_cover(os.path.dirname(p),
+                            os.path.splitext(os.path.basename(p))[0],
+                            book, p)
+        else:
+            debug_print('PRS505: NOT uploading covers in sync_booklists')
+
         USBMS.sync_booklists(self, booklists, end_session=end_session)
         debug_print('PRS505: finished sync_booklists')
@@ -186,8 +223,15 @@ class PRS505(USBMS):
         self.plugboard_func = pb_func
 
     def upload_cover(self, path, filename, metadata, filepath):
-        return # Disabled as the SONY's don't need this thumbnail anyway and
-               # older models don't auto delete it
+        opts = self.settings()
+        if not opts.extra_customization[self.OPT_UPLOAD_COVERS]:
+            # Building thumbnails disabled
+            debug_print('PRS505: not uploading cover')
+            return
+        debug_print('PRS505: uploading cover')
+        self._upload_cover(path, filename, metadata, filepath)
+
+    def _upload_cover(self, path, filename, metadata, filepath):
         if metadata.thumbnail and metadata.thumbnail[-1]:
             path = path.replace('/', os.sep)
             is_main = path.startswith(self._main_prefix)
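
The OPT_* constants above are indexes into the extra_customization list, so each option is read by position. A minimal sketch of the consumption pattern (usage illustrative, names taken from this diff):

    opts = self.settings()
    collections_spec = opts.extra_customization[self.OPT_COLLECTIONS]     # comma separated string
    upload_covers    = opts.extra_customization[self.OPT_UPLOAD_COVERS]   # checkbox, bool
    refresh_covers   = opts.extra_customization[self.OPT_REFRESH_COVERS]  # checkbox, bool
    collections = [x.strip() for x in collections_spec.split(',')]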

View File

@@ -30,6 +30,12 @@ class Drive(str):
         typ.order = order
         return typ
 
+def drivecmp(a, b):
+    ans = cmp(getattr(a, 'order', 0), getattr(b, 'order', 0))
+    if ans == 0:
+        ans = cmp(a, b)
+    return ans
+
 class WinPNPScanner(object):
 
@@ -57,7 +63,13 @@ class WinPNPScanner(object):
         order = 0
         match = re.search(r'REV_.*?&(\d+)#', pnp_id)
         if match is None:
-            match = re.search(r'REV_.*?&(\d+)', pnp_id)
+            # Windows XP
+            # On the Nook Color this is the last digit
+            #
+            # USBSTOR\DISK&VEN_B&N&PROD_EBOOK_DISK&REV_0100\7&13EAFDB8&0&2004760017462009&1
+            # USBSTOR\DISK&VEN_B&N&PROD_EBOOK_DISK&REV_0100\7&13EAFDB8&0&2004760017462009&0
+            #
+            match = re.search(r'REV_.*&(\d+)', pnp_id)
         if match is not None:
             order = int(match.group(1))
         return order
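
drivecmp replaces the attrgetter-based sort (see the usbms change below): drives compare first by the numeric order parsed from the PnP id, then by the drive letter itself, so a drive that lacks an order attribute no longer breaks the sort. A small sketch of its effect (Python 2 cmp-style comparator):

    # Lower order sorts first: main memory (order 0) before card A (order 1).
    # A missing order attribute defaults to 0; ties fall back to comparing
    # the drive letter strings themselves.
    letters = sorted(drives.values(), cmp=drivecmp)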

View File

@@ -140,11 +140,19 @@ class CollectionsBookList(BookList):
         all_by_author = ''
         all_by_title = ''
         ca = []
+        all_by_something = []
         for c in collection_attributes:
-            if c.startswith('aba:') and c[4:]:
+            if c.startswith('aba:') and c[4:].strip():
                 all_by_author = c[4:].strip()
-            elif c.startswith('abt:') and c[4:]:
+            elif c.startswith('abt:') and c[4:].strip():
                 all_by_title = c[4:].strip()
+            elif c.startswith('abs:') and c[4:].strip():
+                name = c[4:].strip()
+                sby = self.in_category_sort_rules(name)
+                if sby is None:
+                    sby = name
+                if name and sby:
+                    all_by_something.append((name, sby))
             else:
                 ca.append(c.lower())
         collection_attributes = ca
@@ -251,6 +259,10 @@ class CollectionsBookList(BookList):
                 if all_by_title not in collections:
                     collections[all_by_title] = {}
                 collections[all_by_title][lpath] = (book, tsval, asval)
+            for (n, sb) in all_by_something:
+                if n not in collections:
+                    collections[n] = {}
+                collections[n][lpath] = (book, book.get(sb, ''), tsval)
 
         # Sort collections
         result = {}
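
The new abs: prefix complements abt: and aba:: it creates an "all books" collection with a caller-chosen name, sorted according to a matching category sort rule when one exists (otherwise the name itself is used as the sort key). As a device collections setting this might look like the following (value illustrative):

    series, tags, abt:All by Title, abs:Everything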

View File

@@ -11,7 +11,7 @@ intended to be subclassed with the relevant parts implemented for a particular
 device. This class handles device detection.
 '''
 
-import os, subprocess, time, re, sys, glob, operator
+import os, subprocess, time, re, sys, glob
 from itertools import repeat
 
 from calibre.devices.interface import DevicePlugin
@@ -225,7 +225,7 @@ class Device(DeviceConfig, DevicePlugin):
         return False
 
     def open_windows(self):
-        from calibre.devices.scanner import win_pnp_drives
+        from calibre.devices.scanner import win_pnp_drives, drivecmp
 
         time.sleep(5)
         drives = {}
@@ -263,7 +263,7 @@ class Device(DeviceConfig, DevicePlugin):
         if self.WINDOWS_MAIN_MEM in (self.WINDOWS_CARD_A_MEM,
                 self.WINDOWS_CARD_B_MEM) or \
                 self.WINDOWS_CARD_A_MEM == self.WINDOWS_CARD_B_MEM:
-            letters = sorted(drives.values(), key=operator.attrgetter('order'))
+            letters = sorted(drives.values(), cmp=drivecmp)
             drives = {}
             for which, letter in zip(['main', 'carda', 'cardb'], letters):
                 drives[which] = letter

View File

@@ -10,7 +10,21 @@ from calibre.utils.config import Config, ConfigProxy
 class DeviceConfig(object):
 
     HELP_MESSAGE = _('Configure Device')
 
+    #: Can be None, a string or a list of strings. When it is a string,
+    #: that string is used for the help text and the actual customization value
+    #: can be read from ``dev.settings().extra_customization``.
+    #: If it is a list of strings, then dev.settings().extra_customization will
+    #: also be a list. In this case, you *must* ensure that
+    #: EXTRA_CUSTOMIZATION_DEFAULT is also a list. The list can contain either
+    #: boolean values or strings, in which case a checkbox or line edit will be
+    #: used for them in the config widget, automatically.
+    #: If a string contains ::: then the text after it is interpreted as the
+    #: tooltip
     EXTRA_CUSTOMIZATION_MESSAGE = None
+
+    #: The default value for extra customization. If you set
+    #: EXTRA_CUSTOMIZATION_MESSAGE you *must* set this as well.
     EXTRA_CUSTOMIZATION_DEFAULT = None
 
     SUPPORTS_SUB_DIRS = False
@@ -73,16 +87,33 @@ class DeviceConfig(object):
         if cls.SUPPORTS_USE_AUTHOR_SORT:
             proxy['use_author_sort'] = config_widget.use_author_sort()
         if cls.EXTRA_CUSTOMIZATION_MESSAGE:
-            ec = unicode(config_widget.opt_extra_customization.text()).strip()
-            if not ec:
-                ec = None
+            if isinstance(cls.EXTRA_CUSTOMIZATION_MESSAGE, list):
+                ec = []
+                for i in range(0, len(cls.EXTRA_CUSTOMIZATION_MESSAGE)):
+                    if hasattr(config_widget.opt_extra_customization[i], 'isChecked'):
+                        ec.append(config_widget.opt_extra_customization[i].isChecked())
+                    else:
+                        ec.append(unicode(config_widget.opt_extra_customization[i].text()).strip())
+            else:
+                ec = unicode(config_widget.opt_extra_customization.text()).strip()
+                if not ec:
+                    ec = None
             proxy['extra_customization'] = ec
         st = unicode(config_widget.opt_save_template.text())
         proxy['save_template'] = st
 
     @classmethod
     def settings(cls):
-        return cls._config().parse()
+        opts = cls._config().parse()
+        if isinstance(cls.EXTRA_CUSTOMIZATION_DEFAULT, list):
+            if opts.extra_customization is None:
+                opts.extra_customization = []
+            if not isinstance(opts.extra_customization, list):
+                opts.extra_customization = [opts.extra_customization]
+            for i,d in enumerate(cls.EXTRA_CUSTOMIZATION_DEFAULT):
+                if i >= len(opts.extra_customization):
+                    opts.extra_customization.append(d)
+        return opts
 
     @classmethod
     def save_template(cls):
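
With the list form documented above, a driver declares one message per option and matching defaults; booleans become checkboxes and strings become line edits in the config widget. A minimal sketch of a subclass using the new API (EXAMPLE_DEV and its options are invented for illustration):

    class EXAMPLE_DEV(USBMS):
        # One line edit (string default) and one checkbox (boolean default).
        EXTRA_CUSTOMIZATION_MESSAGE = [
            _('Books folder') + ':::' + _('Folder on the device to send books to'),
            _('Use subdirectories'),
        ]
        EXTRA_CUSTOMIZATION_DEFAULT = ['eBooks', True]

        OPT_BOOKS_FOLDER = 0
        OPT_USE_SUBDIRS  = 1

        def example_method(self):
            opts = self.settings()
            folder  = opts.extra_customization[self.OPT_BOOKS_FOLDER]
            subdirs = opts.extra_customization[self.OPT_USE_SUBDIRS]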

View File

@@ -18,7 +18,7 @@
 __version__ = "1.0"
 
-import re
+import re, codecs
 
 def detect(aBuf):
     import calibre.ebooks.chardet.universaldetector as universaldetector
@@ -83,9 +83,11 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
     if not raw:
         return u'', encoding
     if not isinstance(raw, unicode):
-        if raw.startswith('\xff\xfe'):
+        if raw.startswith(codecs.BOM_UTF8):
+            raw, encoding = raw.decode('utf-8')[1:], 'utf-8'
+        elif raw.startswith(codecs.BOM_UTF16_LE):
             raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
-        elif raw.startswith('\xfe\xff'):
+        elif raw.startswith(codecs.BOM_UTF16_BE):
             raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be'
     if not isinstance(raw, unicode):
         for pat in ENCODING_PATS:
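
The named constants also close a gap: the old test recognized only UTF-16 byte order marks, never the UTF-8 one. For reference, the BOM values involved (from the Python standard library):

    import codecs
    codecs.BOM_UTF8      # '\xef\xbb\xbf'
    codecs.BOM_UTF16_LE  # '\xff\xfe' (what the old literal matched)
    codecs.BOM_UTF16_BE  # '\xfe\xff'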

View File

@@ -6,11 +6,118 @@ __docformat__ = 'restructuredtext en'
 
 import re
 
+class TCRCompressor(object):
+    '''
+    TCR compression takes the form header+code_dict+coded_text.
+    The header is always "!!8-Bit!!". The code dict is a list of 256 strings.
+    The list takes the form 1 byte length and then a string. Each position in
+    the list corresponds to a code found in the file. The coded text is a
+    string of character values. For instance, the character Q represents the
+    value 81, which corresponds to the string in the code list at position 81.
+    '''
+
+    def _reset(self):
+        # List of indexes in the codes list that are empty and can hold new codes
+        self.unused_codes = set()
+        self.coded_txt = ''
+        # Generate initial codes from text.
+        # The index of the list will be the code that represents the characters at that location
+        # in the list
+        self.codes = []
+
+    def _combine_codes(self):
+        '''
+        Combine two codes that always appear as a pair into a single code.
+        The intent is to create more unused codes.
+        '''
+        possible_codes = []
+        a_code = set(re.findall('(?msu).', self.coded_txt))
+
+        for code in a_code:
+            single_code = set(re.findall('(?msu)%s.' % re.escape(code), self.coded_txt))
+            if len(single_code) == 1:
+                possible_codes.append(single_code.pop())
+
+        for code in possible_codes:
+            self.coded_txt = self.coded_txt.replace(code, code[0])
+            self.codes[ord(code[0])] = '%s%s' % (self.codes[ord(code[0])], self.codes[ord(code[1])])
+
+    def _free_unused_codes(self):
+        '''
+        Look for codes that do not appear in the coded text and add them to
+        the list of free codes.
+        '''
+        for i in xrange(256):
+            if i not in self.unused_codes:
+                if chr(i) not in self.coded_txt:
+                    self.unused_codes.add(i)
+
+    def _new_codes(self):
+        '''
+        Create new codes from codes that occur in pairs often.
+        '''
+        possible_new_codes = list(set(re.findall('(?msu)..', self.coded_txt)))
+        new_codes_count = []
+
+        for c in possible_new_codes:
+            count = self.coded_txt.count(c)
+            # Less than 3 occurrences will not produce any size reduction.
+            if count > 2:
+                new_codes_count.append((c, count))
+
+        # Arrange the codes in order of least to most occurring.
+        possible_new_codes = [x[0] for x in sorted(new_codes_count, key=lambda c: c[1])]
+
+        return possible_new_codes
+
+    def compress(self, txt):
+        self._reset()
+        self.codes = list(set(re.findall('(?msu).', txt)))
+
+        # Replace each character of the text with its corresponding code
+        for c in txt:
+            self.coded_txt += chr(self.codes.index(c))
+
+        # Zero the unused codes and record which are unused.
+        for i in range(len(self.codes), 256):
+            self.codes.append('')
+            self.unused_codes.add(i)
+
+        self._combine_codes()
+        possible_codes = self._new_codes()
+
+        while possible_codes and self.unused_codes:
+            while possible_codes and self.unused_codes:
+                unused_code = self.unused_codes.pop()
+                # Take the last possible code and split it into individual
+                # codes. The last possible code is the most often occurring.
+                code1, code2 = possible_codes.pop()
+                self.codes[unused_code] = '%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)])
+                self.coded_txt = self.coded_txt.replace('%s%s' % (code1, code2), chr(unused_code))
+            self._combine_codes()
+            self._free_unused_codes()
+            possible_codes = self._new_codes()
+
+        self._free_unused_codes()
+
+        # Generate the code dictionary.
+        code_dict = []
+        for i in xrange(0, 256):
+            if i in self.unused_codes:
+                code_dict.append(chr(0))
+            else:
+                code_dict.append(chr(len(self.codes[i])) + self.codes[i])
+
+        # Join the identifier with the dictionary and coded text.
+        return '!!8-Bit!!'+''.join(code_dict)+self.coded_txt
+
 def decompress(stream):
     txt = []
     stream.seek(0)
     if stream.read(9) != '!!8-Bit!!':
-        raise ValueError('File %s contaions an invalid TCR header.' % stream.name)
+        raise ValueError('File %s contains an invalid TCR header.' % stream.name)
 
     # Codes that the file contents are broken down into.
     entries = []
@@ -26,101 +133,6 @@ def decompress(stream):
 
     return ''.join(txt)
 
-def compress(txt, level=5):
-    '''
-    TCR compression takes the form header+code_list+coded_text.
-    The header is always "!!8-Bit!!". The code list is a list of 256 strings.
-    The list takes the form 1 byte length and then a string. Each position in
-    The list corresponds to a code found in the file. The coded text is
-    string of characters vaules. for instance the character Q represents the
-    value 81 which corresponds to the string in the code list at position 81.
-    '''
-    # Turn each unique character into a coded value.
-    # The code of the string at a given position are represented by the position
-    # they occupy in the list.
-    codes = list(set(re.findall('(?msu).', txt)))
-    for i in range(len(codes), 256):
-        codes.append('')
-    # Set the compression level.
-    if level <= 1:
-        new_length = 256
-    if level >= 10:
-        new_length = 1
-    else:
-        new_length = int(256 * (10 - level) * .1)
-    new_length = 1 if new_length < 1 else new_length
-    # Replace txt with codes.
-    coded_txt = ''
-    for c in txt:
-        coded_txt += chr(codes.index(c))
-    txt = coded_txt
-    # Start compressing the text.
-    new = True
-    merged = True
-    while new or merged:
-        # Merge codes that always follow another code
-        merge = []
-        merged = False
-        for i in xrange(256):
-            if codes[i] != '':
-                # Find all codes that are next to i.
-                fall = list(set(re.findall('(?msu)%s.' % re.escape(chr(i)), txt)))
-                # 1 if only one code comes after i.
-                if len(fall) == 1:
-                    # We are searching codes and each code is always 1 character.
-                    j = ord(fall[0][1:2])
-                    # Only merge if the total length of the string represented by
-                    # code is less than 256.
-                    if len(codes[i]) + len(codes[j]) < 256:
-                        merge.append((i, j))
-        if merge:
-            merged = True
-            for i, j in merge:
-                # Merge the string for j into the string for i.
-                if i == j:
-                    # Don't use += here just in case something goes wrong. This
-                    # will prevent out of control memory consumption. This is
-                    # unecessary but when creating this routine it happened due
-                    # to an error.
-                    codes[i] = codes[i] + codes[i]
-                else:
-                    codes[i] = codes[i] + codes[j]
-                txt = txt.replace(chr(i)+chr(j), chr(i))
-                if chr(j) not in txt:
-                    codes[j] = ''
-        new = False
-        if '' in codes:
-            # Create a list of codes based on combinations of codes that are next
-            # to each other. The amount of savings for the new code is calculated.
-            new_codes = []
-            for c in list(set(re.findall('(?msu)..', txt))):
-                i = ord(c[0:1])
-                j = ord(c[1:2])
-                if codes[i]+codes[j] in codes:
-                    continue
-                savings = txt.count(chr(i)+chr(j)) - len(codes[i]) - len(codes[j])
-                if savings > 2 and len(codes[i]) + len(codes[j]) < 256:
-                    new_codes.append((savings, i, j, codes[i], codes[j]))
-            if new_codes:
-                new = True
-                # Sort the codes from highest savings to lowest.
-                new_codes.sort(lambda x, y: -1 if x[0] > y[0] else 1 if x[0] < y[0] else 0)
-                # The shorter new_length the more chances time merging will happen
-                # giving more changes for better codes to be created. However,
-                # the shorter new_lengh the longer it will take to compress.
-                new_codes = new_codes[:new_length]
-                for code in new_codes:
-                    if '' not in codes:
-                        break
-                    c = codes.index('')
-                    codes[c] = code[3]+code[4]
-                    txt = txt.replace(chr(code[1])+chr(code[2]), chr(c))
-    # Generate the code dictionary.
-    header = []
-    for code in codes:
-        header.append(chr(len(code))+code)
-    for i in xrange(len(header), 256):
-        header.append(chr(0))
-    # Join the identifier with the dictionary and coded text.
-    return '!!8-Bit!!'+''.join(header)+txt
+def compress(txt):
+    t = TCRCompressor()
+    return t.compress(txt)
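
The compressor and the existing decompress function round-trip through the container described in the docstring above. A minimal sketch (the import path is an assumption about where this module lives in calibre):

    from cStringIO import StringIO
    from calibre.ebooks.compression.tcr import compress, decompress

    text = 'the cat sat on the mat ' * 40
    tcr = compress(text)
    # The container always starts with the fixed 9-byte header, followed
    # by the 256-entry code dictionary and then the coded text.
    assert tcr[:9] == '!!8-Bit!!'
    # decompress expects a seekable stream, not a string.
    assert decompress(StringIO(tcr)) == text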

View File

@@ -88,6 +88,7 @@ class Plumber(object):
         self.ui_reporter = report_progress
         self.abort_after_input_dump = abort_after_input_dump
 
+        # Pipeline options {{{
         # Initialize the conversion options that are independent of input and
         # output formats. The input and output plugins can still disable these
         # options via recommendations.
@@ -527,6 +528,7 @@ OptionRecommendation(name='timestamp',
             help=_('Set the book timestamp (used by the date column in calibre).')),
         ]
+        # }}}
 
         input_fmt = os.path.splitext(self.input)[1]
         if not input_fmt:
@@ -977,6 +979,8 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
     from calibre.ebooks.oeb.base import OEBBook
     html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
             opts.preprocess_html, opts)
+    if not encoding:
+        encoding = None
     oeb = OEBBook(log, html_preprocessor,
             pretty_print=opts.pretty_print, input_encoding=encoding)
     if not populate:

View File

@@ -51,16 +51,16 @@ def chap_head(match):
     chap = match.group('chap')
     title = match.group('title')
     if not title:
         return '<h1>'+chap+'</h1><br/>\n'
     else:
         return '<h1>'+chap+'</h1>\n<h3>'+title+'</h3>\n'
 
 def wrap_lines(match):
     ital = match.group('ital')
     if not ital:
         return ' '
     else:
         return ital+' '
 
 class DocAnalysis(object):
     '''
@@ -78,6 +78,8 @@ class DocAnalysis(object):
             linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
         elif format == 'spanned_html':
             linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
+        elif format == 'txt':
+            linere = re.compile('.*?\n')
         self.lines = linere.findall(raw)
 
     def line_length(self, percent):
@@ -175,7 +177,7 @@ class Dehyphenator(object):
     def __init__(self):
         # Add common suffixes to the regex below to increase the likelihood of a match -
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
         self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
         self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
@@ -191,13 +193,13 @@ class Dehyphenator(object):
         dehyphenated = unicode(firsthalf) + unicode(secondhalf)
         lookupword = self.removesuffixes.sub('', dehyphenated)
         if self.prefixes.match(firsthalf) is None:
             lookupword = self.removeprefix.sub('', lookupword)
         #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
         try:
             searchresult = self.html.find(lookupword.lower())
         except:
             return hyphenated
-        if self.format == 'html_cleanup':
+        if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
             if self.html.find(lookupword) != -1 or searchresult != -1:
                 #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
                 return dehyphenated
@@ -223,10 +225,15 @@ class Dehyphenator(object):
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)
         elif format == 'pdf':
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+        elif format == 'txt':
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|)(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
         elif format == 'html_cleanup':
             intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
+        elif format == 'txt_cleanup':
+            intextmatch = re.compile(u'(?P<firstpart>\w+)(-|)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
 
         html = intextmatch.sub(self.dehyphenate, html)
         return html
@@ -353,7 +360,7 @@ class HTMLPreProcessor(object):
                   (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
 
                   # Center separator lines
                   (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
 
                   # Remove page links
                   (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
@@ -363,13 +370,11 @@ class HTMLPreProcessor(object):
                   # Remove gray background
                   (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
 
-                  # Detect Chapters to match default XPATH in GUI
-                  (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
-                  # Cover the case where every letter in a chapter title is separated by a space
-                  (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
-                  # Have paragraphs show better
-                  (re.compile(r'<br.*?>'), lambda match : '<p>'),
+                  # Convert line breaks to paragraphs
+                  (re.compile(r'<br[^>]*>\s*'), lambda match : '</p>\n<p>'),
+                  (re.compile(r'<body[^>]*>\s*'), lambda match : '<body>\n<p>'),
+                  (re.compile(r'\s*</body>'), lambda match : '</p>\n</body>'),
+
                   # Clean up spaces
                   (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
                   # Add space before and after italics
@@ -455,9 +460,9 @@ class HTMLPreProcessor(object):
         # delete soft hyphens - moved here so it's executed after header/footer removal
         if is_pdftohtml:
             # unwrap/delete soft hyphens
-            end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
+            end_rules.append((re.compile(u'[­](</p>\s*<p>\s*)+\s*(?=[[a-z\d])'), lambda match: ''))
             # unwrap/delete soft hyphens with formatting
-            end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
+            end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
 
         # Make the more aggressive chapter marking regex optional with the preprocess option to
         # reduce false positives and move after header/footer removal
@@ -475,7 +480,7 @@ class HTMLPreProcessor(object):
             end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
             end_rules.append(
                 # Un wrap using punctuation
-                (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
             )
 
         for rule in self.PREPROCESS + start_rules:
@@ -508,7 +513,15 @@ class HTMLPreProcessor(object):
         if is_pdftohtml and length > -1:
             # Dehyphenate
             dehyphenator = Dehyphenator()
-            html = dehyphenator(html,'pdf', length)
+            html = dehyphenator(html,'html', length)
+
+        if is_pdftohtml:
+            from calibre.ebooks.conversion.utils import PreProcessor
+            pdf_markup = PreProcessor(self.extra_opts, None)
+            totalwords = 0
+            totalwords = pdf_markup.get_word_count(html)
+            if totalwords > 7000:
+                html = pdf_markup.markup_chapters(html, totalwords, True)
 
         #dump(html, 'post-preprocess')
@@ -554,5 +567,9 @@ class HTMLPreProcessor(object):
             html = smartyPants(html)
             html = html.replace(start, '<!--')
             html = html.replace(stop, '-->')
+        # convert ellipsis to entities to prevent wrapping
+        html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
+        # convert double dashes to em-dash
+        html = re.sub(r'\s--\s', u'\u2014', html)
         return substitute_entites(html)
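
With the new 'txt' and 'txt_cleanup' formats the dehyphenator can now be pointed at plain text as well as HTML. A minimal sketch of the call pattern (txt and length assumed to be in scope, as in the conversion pipeline):

    dehyphenator = Dehyphenator()
    txt = dehyphenator(txt, 'txt', length)          # unwrap hyphenated line breaks
    txt = dehyphenator(txt, 'txt_cleanup', length)  # catch anything left over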

View File

@@ -6,8 +6,10 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
 import re
+from math import ceil
 from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
+from calibre.utils.wordcount import get_wordcount_obj
 
 class PreProcessor(object):
 
@@ -17,6 +19,9 @@ class PreProcessor(object):
         self.found_indents = 0
         self.extra_opts = extra_opts
 
+    def is_pdftohtml(self, src):
+        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
+
     def chapter_head(self, match):
         chap = match.group('chap')
         title = match.group('title')
@@ -64,7 +69,7 @@ class PreProcessor(object):
         inspect. Percent is the minimum percent of line endings which should
         be marked up to return true.
         '''
-        htm_end_ere = re.compile('</p>', re.DOTALL)
+        htm_end_ere = re.compile('</(p|div)>', re.DOTALL)
         line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
         htm_end = htm_end_ere.findall(raw)
         line_end = line_end_ere.findall(raw)
@@ -101,36 +106,140 @@ class PreProcessor(object):
         with open(os.path.join(odir, name), 'wb') as f:
             f.write(raw.encode('utf-8'))
 
+    def get_word_count(self, html):
+        word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
+        word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
+        wordcount = get_wordcount_obj(word_count_text)
+        return wordcount.words
+
+    def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
+        # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
+        # minimum of chapters to search for
+        self.min_chapters = 1
+        if wordcount > 7000:
+            self.min_chapters = int(ceil(wordcount / 7000.))
+        #print "minimum chapters required are: "+str(self.min_chapters)
+        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
+        self.html_preprocess_sections = len(heading.findall(html))
+        self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
+
+        # Build the Regular Expressions in pieces
+        init_lookahead = "(?=<(p|div))"
+        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
+        chapter_header_open = r"(?P<chap>"
+        title_header_open = r"(?P<title>"
+        chapter_header_close = ")\s*"
+        title_header_close = ")"
+        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
+        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
+
+        is_pdftohtml = self.is_pdftohtml(html)
+        if is_pdftohtml:
+            chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
+            chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
+            title_line_open = "<(?P<outer2>p)[^>]*>\s*"
+            title_line_close = "\s*</(?P=outer2)>"
+
+        if blanks_between_paragraphs:
+            blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
+        else:
+            blank_lines = ""
+        opt_title_open = "("
+        opt_title_close = ")?"
+        n_lookahead_open = "\s+(?!"
+        n_lookahead_close = ")"
+
+        default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+
+        chapter_types = [
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
+            [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
+            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
+            [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
+            ]
+
+        # Start with most typical chapter headings, get more aggressive until one works
+        for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
+            if self.html_preprocess_sections >= self.min_chapters:
+                break
+            full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
+            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+            self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+            if lookahead_ignorecase:
+                chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+                chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+            else:
+                chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
+                chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
+            html = chapdetect.sub(self.chapter_head, html)
+
+        words_per_chptr = wordcount
+        if words_per_chptr > 0 and self.html_preprocess_sections > 0:
+            words_per_chptr = wordcount / self.html_preprocess_sections
+        self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
+        return html
+
+    def punctuation_unwrap(self, length, content, format):
+        # define the pieces of the regex
+        lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
+        line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
+        blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
+        line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
+        txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
+
+        unwrap_regex = lookahead+line_ending+blanklines+line_opening
+        if format == 'txt':
+            unwrap_regex = lookahead+txt_line_wrap
+
+        unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
+        content = unwrap.sub(' ', content)
+        return content
+
     def __call__(self, html):
         self.log("********* Preprocessing HTML *********")
+
+        # Count the words in the document to estimate how many chapters to look for and whether
+        # other types of processing are attempted
+        totalwords = 0
+        totalwords = self.get_word_count(html)
+
+        if totalwords < 50:
+            self.log("not enough text, not preprocessing")
+            return html
+
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
-        html = re.sub(r"\s*</p>", "</p>\n", html)
-        html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html)
+        html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
+        html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
 
         ###### Check Markup ######
         #
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
         # <pre> tags), check and mark up line endings if required before proceeding
         if self.no_markup(html, 0.1):
             self.log("not enough paragraph markers, adding now")
             # check if content is in pre tags, use txt processor to mark up if so
             pre = re.compile(r'<pre>', re.IGNORECASE)
             if len(pre.findall(html)) == 1:
                 self.log("Running Text Processing")
                 from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
                     separate_paragraphs_single_line
                 outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
                 html = outerhtml.sub('\g<text>', html)
                 html = separate_paragraphs_single_line(html)
                 html = preserve_spaces(html)
                 html = convert_basic(html, epub_split_size_kb=0)
             else:
                 # Add markup naively
                 # TODO - find out if there are cases where there are more than one <pre> tag or
                 # other types of unmarked html and handle them in some better fashion
                 add_markup = re.compile('(?<!>)(\n)')
                 html = add_markup.sub('</p>\n<p>', html)
 
         ###### Mark Indents/Cleanup ######
         #
@@ -141,12 +250,17 @@ class PreProcessor(object):
         self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
         # remove remaining non-breaking spaces
         html = re.sub(ur'\u00a0', ' ', html)
+        # Get rid of various common microsoft specific tags which can cause issues later
         # Get rid of empty <o:p> tags to simplify other processing
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
+        # Delete microsoft 'smart' tags
+        html = re.sub('(?i)</?st1:\w+>', '', html)
         # Get rid of empty span, bold, & italics tags
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
+        # ADE doesn't render <br />, change to empty paragraphs
+        #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
 
         # If more than 40% of the lines are empty paragraphs and the user has enabled remove
         # paragraph spacing then delete blank lines to clean up spacing
@@ -164,63 +278,16 @@ class PreProcessor(object):
             self.log("deleting blank lines")
             html = blankreg.sub('', html)
         elif float(len(blanklines)) / float(len(lines)) > 0.40:
             blanks_between_paragraphs = True
             #print "blanks between paragraphs is marked True"
         else:
             blanks_between_paragraphs = False
+
         #self.dump(html, 'before_chapter_markup')
+
         # detect chapters/sections to match xpath or splitting logic
         #
-        # Build the Regular Expressions in pieces
-        init_lookahead = "(?=<(p|div))"
-        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
-        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
-        chapter_header_open = r"(?P<chap>"
-        title_header_open = r"(?P<title>"
-        chapter_header_close = ")\s*"
-        title_header_close = ")"
-        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
-        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
-
-        if blanks_between_paragraphs:
-            blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
-        else:
-            blank_lines = ""
-        opt_title_open = "("
-        opt_title_close = ")?"
-        n_lookahead_open = "\s+(?!"
-        n_lookahead_close = ")"
-
-        default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"
-
-        min_chapters = 10
-        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
-        self.html_preprocess_sections = len(heading.findall(html))
-        self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
-
-        chapter_types = [
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
-            [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
-            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
-            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
-            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
-            ]
-
-        # Start with most typical chapter headings, get more aggressive until one works
-        for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
-            if self.html_preprocess_sections >= min_chapters:
-                break
-            full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
-            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-            self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
-            if lookahead_ignorecase:
-                chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-                chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-            else:
-                chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
-                chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
-            html = chapdetect.sub(self.chapter_head, html)
+        html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
 
         ###### Unwrap lines ######
@@ -247,7 +314,7 @@ class PreProcessor(object):
             # Calculate Length
             unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
             length = docanalysis.line_length(unwrap_factor)
-            self.log("*** Median line length is " + unicode(length) + ", calculated with " + format + " format ***")
+            self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
             # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
             if hardbreaks or unwrap_factor < 0.4:
                 self.log("Unwrapping required, unwrapping Lines")
@@ -260,8 +327,7 @@ class PreProcessor(object):
                 self.log("Done dehyphenating")
                 # Unwrap lines using punctation and line length
                 #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
-                unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-                html = unwrap.sub(' ', html)
+                html = self.punctuation_unwrap(length, html, 'html')
                 #check any remaining hyphens, but only unwrap if there is a match
                 dehyphenator = Dehyphenator()
                 html = dehyphenator(html,'html_cleanup', length)
@@ -276,7 +342,7 @@ class PreProcessor(object):
         html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
 
         # If still no sections after unwrapping mark split points on lines with no punctuation
-        if self.html_preprocess_sections < 5:
+        if self.html_preprocess_sections < self.min_chapters:
             self.log("Looking for more split points based on punctuation,"
                     " currently have " + unicode(self.html_preprocess_sections))
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
@@ -291,6 +357,6 @@ class PreProcessor(object):
             html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
 
         # Center separator lines
-        html = re.sub(u'<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
+        html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
 
         return html
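
Factoring get_word_count, markup_chapters and punctuation_unwrap out of __call__ lets other inputs reuse them; the pdftohtml path added above is one caller. A minimal sketch of the calling pattern (extra_opts, html and length assumed in scope):

    preprocessor = PreProcessor(extra_opts, None)
    totalwords = preprocessor.get_word_count(html)
    if totalwords > 7000:
        # only attempt chapter markup on book-length documents
        html = preprocessor.markup_chapters(html, totalwords, True)
    html = preprocessor.punctuation_unwrap(length, html, 'html')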

View File

@ -101,6 +101,7 @@ class FB2MLizer(object):
metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year) metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year)
metadata['lang'] = u''.join(self.oeb_book.metadata.lang) if self.oeb_book.metadata.lang else 'en' metadata['lang'] = u''.join(self.oeb_book.metadata.lang) if self.oeb_book.metadata.lang else 'en'
metadata['id'] = None metadata['id'] = None
metadata['cover'] = self.get_cover()
author_parts = self.oeb_book.metadata.creator[0].value.split(' ') author_parts = self.oeb_book.metadata.creator[0].value.split(' ')
if len(author_parts) == 1: if len(author_parts) == 1:
@ -123,7 +124,8 @@ class FB2MLizer(object):
metadata['id'] = str(uuid.uuid4()) metadata['id'] = str(uuid.uuid4())
for key, value in metadata.items(): for key, value in metadata.items():
metadata[key] = prepare_string_for_xml(value) if not key == 'cover':
metadata[key] = prepare_string_for_xml(value)
return u'<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:xlink="http://www.w3.org/1999/xlink">' \ return u'<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:xlink="http://www.w3.org/1999/xlink">' \
'<description>' \ '<description>' \
@ -135,6 +137,7 @@ class FB2MLizer(object):
'<last-name>%(author_last)s</last-name>' \ '<last-name>%(author_last)s</last-name>' \
'</author>' \ '</author>' \
'<book-title>%(title)s</book-title>' \ '<book-title>%(title)s</book-title>' \
'%(cover)s' \
'<lang>%(lang)s</lang>' \ '<lang>%(lang)s</lang>' \
'</title-info>' \ '</title-info>' \
'<document-info>' \ '<document-info>' \
@ -153,6 +156,39 @@ class FB2MLizer(object):
def fb2_footer(self): def fb2_footer(self):
return u'</FictionBook>' return u'</FictionBook>'
def get_cover(self):
cover_href = None
# Get the raster cover if it's available.
if self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
id = unicode(self.oeb_book.metadata.cover[0])
cover_item = self.oeb_book.manifest.ids[id]
if cover_item.media_type in OEB_RASTER_IMAGES:
cover_href = cover_item.href
else:
# Figure out if we have a title page or a cover page
page_name = ''
if 'titlepage' in self.oeb_book.guide:
page_name = 'titlepage'
elif 'cover' in self.oeb_book.guide:
page_name = 'cover'
if page_name:
cover_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[page_name].href]
# Get the first image in the page
for img in cover_item.xpath('//img'):
cover_href = cover_item.abshref(img.get('src'))
break
if cover_href:
# Only write the image tag if it is in the manifest.
if cover_href in self.oeb_book.manifest.hrefs.keys():
if cover_href not in self.image_hrefs.keys():
self.image_hrefs[cover_href] = '_%s.jpg' % len(self.image_hrefs.keys())
return u'<coverpage><image xlink:href="#%s" /></coverpage>' % self.image_hrefs[cover_href]
return u''
def get_text(self): def get_text(self):
text = ['<body>'] text = ['<body>']
@@ -200,8 +236,10 @@ class FB2MLizer(object):
                     im = Image()
                     im.load(item.data)
                     im.set_compression_quality(70)
-                    data = im.export('jpg')
-                    raw_data = b64encode(data)
+                    imdata = im.export('jpg')
+                    raw_data = b64encode(imdata)
+                else:
+                    raw_data = b64encode(item.data)
                 # Don't put the encoded image on a single line.
                 data = ''
                 col = 1
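Taken together, the two pieces above give the FB2 cover: get_cover() emits a <coverpage> element that references an image id, and the image bytes are embedded elsewhere as base64, wrapped across lines rather than emitted as one enormous line. A minimal sketch with assumed file names and bytes (not calibre code):

    from base64 import b64encode
    image_hrefs = {'images/cover.jpg': '_0.jpg'}
    print u'<coverpage><image xlink:href="#%s" /></coverpage>' % \
            image_hrefs['images/cover.jpg']
    raw_data = b64encode('\xff\xd8\xff\xe0-fake-jpeg-bytes')
    # Wrap the encoded image instead of leaving it on a single line.
    print '\n'.join(raw_data[i:i + 72] for i in range(0, len(raw_data), 72))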

View File

@@ -41,17 +41,24 @@ class FB2Input(InputFormatPlugin):
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER
+       from calibre.ebooks.chardet import xml_to_unicode
        NAMESPACES = {'f':FB2NS, 'l':XLINK_NS}
        log.debug('Parsing XML...')
        raw = stream.read().replace('\0', '')
+       raw = xml_to_unicode(raw, strip_encoding_pats=True,
+               assume_utf8=True, resolve_entities=True)[0]
        try:
            doc = etree.fromstring(raw)
        except etree.XMLSyntaxError:
            try:
                doc = etree.fromstring(raw, parser=RECOVER_PARSER)
+               if doc is None:
+                   raise Exception('parse failed')
            except:
                doc = etree.fromstring(raw.replace('& ', '&amp;'),
                        parser=RECOVER_PARSER)
+       if doc is None:
+           raise ValueError('The FB2 file is not valid XML')
        stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
        css = ''
        for s in stylesheets:
@@ -97,13 +104,17 @@ class FB2Input(InputFormatPlugin):
        entries = [(f, guess_type(f)[0]) for f in os.listdir('.')]
        opf.create_manifest(entries)
        opf.create_spine(['index.xhtml'])
-       for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
-           href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
-           if href is not None:
-               if href.startswith('#'):
-                   href = href[1:]
-               opf.guide.set_cover(os.path.abspath(href))
+       if mi.cover_data and mi.cover_data[1]:
+           with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
+               f.write(mi.cover_data[1])
+           opf.guide.set_cover(os.path.abspath('fb2_cover_calibre_mi.jpg'))
+       else:
+           for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
+               href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
+               if href is not None:
+                   if href.startswith('#'):
+                       href = href[1:]
+                   opf.guide.set_cover(os.path.abspath(href))
        opf.render(open('metadata.opf', 'wb'))
        return os.path.join(os.getcwd(), 'metadata.opf')
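A condensed sketch of the recovery ladder the input plugin now uses. The sample document is assumed, and calibre's RECOVER_PARSER is approximated here by an lxml parser with recover=True:

    from lxml import etree
    RECOVER_PARSER = etree.XMLParser(recover=True)
    raw = u'<FictionBook><body><p>unclosed'
    try:
        doc = etree.fromstring(raw)       # strict parse fails on the sample
    except etree.XMLSyntaxError:
        doc = etree.fromstring(raw, parser=RECOVER_PARSER)
    if doc is None:
        # recover=True can return None instead of raising; fail loudly then
        raise ValueError('The FB2 file is not valid XML')
    print etree.tostring(doc)  # recovered tree with the tags closed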

View File

@@ -35,7 +35,7 @@ class FB2Output(OutputFormatPlugin):
                rasterizer = SVGRasterizer()
                rasterizer(oeb_book, opts)
            except Unavailable:
-               self.log.warn('SVG rasterizer unavailable, SVG will not be converted')
+               log.warn('SVG rasterizer unavailable, SVG will not be converted')
        linearize_jacket(oeb_book)

View File

@@ -119,7 +119,7 @@ class HTMLFile(object):
            self.is_binary = level > 0 and not bool(self.HTML_PAT.search(src[:4096]))
            if not self.is_binary:
-               if encoding is None:
+               if not encoding:
                    encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
                self.encoding = encoding
            else:
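The point of 'if not encoding' (values assumed): an empty string from an earlier failed detection should trigger re-detection exactly like None does, instead of being stored as the file's encoding.

    for encoding in (None, '', 'utf-8'):
        print repr(encoding), '-> redetect' if not encoding else '-> keep'
    # None and '' both redetect; only a real codec name is kept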

View File

@@ -16,6 +16,7 @@ from calibre.ebooks.metadata.book import TOP_LEVEL_CLASSIFIERS
 from calibre.ebooks.metadata.book import ALL_METADATA_FIELDS
 from calibre.library.field_metadata import FieldMetadata
 from calibre.utils.date import isoformat, format_date
+from calibre.utils.icu import sort_key
 from calibre.utils.formatter import TemplateFormatter
@@ -38,15 +39,16 @@ class SafeFormat(TemplateFormatter):
     def get_value(self, key, args, kwargs):
         try:
+            key = key.lower()
             if key != 'title_sort':
-                key = field_metadata.search_term_to_field_key(key.lower())
+                key = field_metadata.search_term_to_field_key(key)
             b = self.book.get_user_metadata(key, False)
             if b and b['datatype'] == 'int' and self.book.get(key, 0) == 0:
                 v = ''
             elif b and b['datatype'] == 'float' and self.book.get(key, 0.0) == 0.0:
                 v = ''
             else:
-                ign, v = self.book.format_field(key.lower(), series_with_index=False)
+                ign, v = self.book.format_field(key, series_with_index=False)
             if v is None:
                 return ''
             if v == '':
@@ -159,6 +161,11 @@ class Metadata(object):
         try:
             return self.__getattribute__(field)
         except AttributeError:
+            if field.startswith('#') and field.endswith('_index'):
+                try:
+                    return self.get_extra(field[:-6])
+                except:
+                    pass
             return default

     def get_extra(self, field):
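What the new fallback resolves (the custom column name below is assumed): a composite lookup like '#myseries_index' is routed to the extra value stored under '#myseries'; the slice is just the length of '_index'.

    field = '#myseries_index'
    assert field.startswith('#') and field.endswith('_index')
    print field[:-6]  # '#myseries', since len('_index') == 6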
@@ -317,14 +324,16 @@ class Metadata(object):
         if metadata is None:
             traceback.print_stack()
             return
-        metadata = copy.deepcopy(metadata)
-        if '#value#' not in metadata:
-            if metadata['datatype'] == 'text' and metadata['is_multiple']:
-                metadata['#value#'] = []
+        m = {}
+        for k in metadata:
+            m[k] = copy.copy(metadata[k])
+        if '#value#' not in m:
+            if m['datatype'] == 'text' and m['is_multiple']:
+                m['#value#'] = []
             else:
-                metadata['#value#'] = None
+                m['#value#'] = None
         _data = object.__getattribute__(self, '_data')
-        _data['user_metadata'][field] = metadata
+        _data['user_metadata'][field] = m

     def template_to_attribute(self, other, ops):
         '''
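A sketch of the copy semantics relied on above (toy dict assumed): the user-metadata values are flat, so a shallow copy per key already yields an independent mapping without deepcopy's full recursive walk.

    import copy
    metadata = {'datatype': 'text', 'is_multiple': True, '#value#': ['a', 'b']}
    m = {}
    for k in metadata:
        m[k] = copy.copy(metadata[k])
    m['#value#'].append('c')
    print metadata['#value#']  # ['a', 'b'] -- the original list is untouched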
@@ -484,7 +493,7 @@ class Metadata(object):
         return authors_to_string(self.authors)

     def format_tags(self):
-        return u', '.join([unicode(t) for t in self.tags])
+        return u', '.join([unicode(t) for t in sorted(self.tags, key=sort_key)])

     def format_rating(self):
         return unicode(self.rating)
@@ -524,7 +533,7 @@ class Metadata(object):
             orig_res = res
             datatype = cmeta['datatype']
             if datatype == 'text' and cmeta['is_multiple']:
-                res = u', '.join(res)
+                res = u', '.join(sorted(res, key=sort_key))
             elif datatype == 'series' and series_with_index:
                 if self.get_extra(key) is not None:
                     res = res + \
@@ -554,7 +563,7 @@ class Metadata(object):
             elif key == 'series_index':
                 res = self.format_series_index(res)
             elif datatype == 'text' and fmeta['is_multiple']:
-                res = u', '.join(res)
+                res = u', '.join(sorted(res, key=sort_key))
             elif datatype == 'series' and series_with_index:
                 res = res + ' [%s]'%self.format_series_index()
             elif datatype == 'datetime':
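Why sorted(..., key=sort_key) rather than plain sorted() (sample tags assumed): unicode code-point order puts all capitals before all lowercase letters, while the ICU collation key sorts the way the tag display should read.

    tags = [u'Zoology', u'adventure', u'Fiction']
    print u', '.join(sorted(tags))  # Fiction, Zoology, adventure
    # With sort_key from calibre.utils.icu the expected display order is:
    #   adventure, Fiction, Zoology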

View File

@@ -9,6 +9,7 @@ import mimetypes, os
 from base64 import b64decode
 from lxml import etree
 from calibre.ebooks.metadata import MetaInformation
+from calibre.ebooks.chardet import xml_to_unicode

 XLINK_NS = 'http://www.w3.org/1999/xlink'
 def XLINK(name):
@@ -23,7 +24,10 @@ def get_metadata(stream):
     tostring = lambda x : etree.tostring(x, method='text',
             encoding=unicode).strip()
     parser = etree.XMLParser(recover=True, no_network=True)
-    root = etree.fromstring(stream.read(), parser=parser)
+    raw = stream.read()
+    raw = xml_to_unicode(raw, strip_encoding_pats=True,
+            assume_utf8=True)[0]
+    root = etree.fromstring(raw, parser=parser)
     authors, author_sort = [], None
     for au in XPath('//fb2:author')(root):
         fname = lname = author = None
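Why the raw bytes go through xml_to_unicode with strip_encoding_pats=True first (sample input assumed): lxml refuses unicode strings that still carry an XML encoding declaration, so the declaration must be stripped before parsing.

    from lxml import etree
    raw = u'<?xml version="1.0" encoding="utf-8"?><FictionBook/>'
    try:
        etree.fromstring(raw)
    except ValueError as e:
        print e  # Unicode strings with encoding declaration are not supported...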

View File

@@ -17,10 +17,10 @@ BASE_URL = 'http://isbndb.com/api/books.xml?access_key=%(key)s&page_number=1&res
 class ISBNDBError(Exception):
     pass

-def fetch_metadata(url, max=100, timeout=5.):
+def fetch_metadata(url, max=3, timeout=5.):
     books = []
     page_number = 1
-    total_results = sys.maxint
+    total_results = 31
     br = browser()
     while len(books) < total_results and max > 0:
         try:
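The effect of the new defaults, with the network stubbed out (all counts assumed): at most three pages are requested, and fetching stops early once the reported total is reached, so a slow or flaky isbndb can no longer hold the loop for a hundred round trips.

    def fetch_stub(max=3, total_results=31, per_page=10):
        books = []
        while len(books) < total_results and max > 0:
            books.extend(['book'] * per_page)  # stands in for one XML page
            max -= 1
        return books
    print len(fetch_stub())  # 30 -- bounded even if the API reports more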

View File

@@ -10,7 +10,8 @@ from calibre.ebooks.metadata import MetaInformation, string_to_authors
 title_pat = re.compile(r'\{\\info.*?\{\\title(.*?)(?<!\\)\}', re.DOTALL)
 author_pat = re.compile(r'\{\\info.*?\{\\author(.*?)(?<!\\)\}', re.DOTALL)
 comment_pat = re.compile(r'\{\\info.*?\{\\subject(.*?)(?<!\\)\}', re.DOTALL)
-category_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)
+tags_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)
+publisher_pat = re.compile(r'\{\\info.*?\{\\manager(.*?)(?<!\\)\}', re.DOTALL)

 def get_document_info(stream):
     """
@@ -82,61 +83,73 @@ def decode(raw, codec):

 def get_metadata(stream):
     """ Return metadata as a L{MetaInfo} object """
-    title, author, comment, category = None, None, None, None
     stream.seek(0)
     if stream.read(5) != r'{\rtf':
-        return MetaInformation(None, None)
+        return MetaInformation(_('Unknown'))
     block = get_document_info(stream)[0]
     if not block:
-        return MetaInformation(None, None)
+        return MetaInformation(_('Unknown'))

     stream.seek(0)
     cpg = detect_codepage(stream)
     stream.seek(0)

     title_match = title_pat.search(block)
-    if title_match:
+    if title_match is not None:
         title = decode(title_match.group(1).strip(), cpg)
+    else:
+        title = _('Unknown')
     author_match = author_pat.search(block)
-    if author_match:
+    if author_match is not None:
         author = decode(author_match.group(1).strip(), cpg)
-    comment_match = comment_pat.search(block)
-    if comment_match:
-        comment = decode(comment_match.group(1).strip(), cpg)
-    category_match = category_pat.search(block)
-    if category_match:
-        category = decode(category_match.group(1).strip(), cpg)
-    mi = MetaInformation(title, author)
+    else:
+        author = None
+    mi = MetaInformation(title)
     if author:
         mi.authors = string_to_authors(author)
-    mi.comments = comment
-    mi.category = category
+
+    comment_match = comment_pat.search(block)
+    if comment_match is not None:
+        comment = decode(comment_match.group(1).strip(), cpg)
+        mi.comments = comment
+    tags_match = tags_pat.search(block)
+    if tags_match is not None:
+        tags = decode(tags_match.group(1).strip(), cpg)
+        mi.tags = tags
+    publisher_match = publisher_pat.search(block)
+    if publisher_match is not None:
+        publisher = decode(publisher_match.group(1).strip(), cpg)
+        mi.publisher = publisher
     return mi

 def create_metadata(stream, options):
-    md = r'{\info'
+    md = [r'{\info']
     if options.title:
         title = options.title.encode('ascii', 'ignore')
-        md += r'{\title %s}'%(title,)
+        md.append(r'{\title %s}'%(title,))
     if options.authors:
         au = options.authors
         if not isinstance(au, basestring):
             au = u', '.join(au)
         author = au.encode('ascii', 'ignore')
-        md += r'{\author %s}'%(author,)
-    if options.get('category', None):
-        category = options.category.encode('ascii', 'ignore')
-        md += r'{\category %s}'%(category,)
+        md.append(r'{\author %s}'%(author,))
     comp = options.comment if hasattr(options, 'comment') else options.comments
     if comp:
         comment = comp.encode('ascii', 'ignore')
-        md += r'{\subject %s}'%(comment,)
-    if len(md) > 6:
-        md += '}'
+        md.append(r'{\subject %s}'%(comment,))
+    if options.publisher:
+        publisher = options.publisher.encode('ascii', 'ignore')
+        md.append(r'{\manager %s}'%(publisher,))
+    if options.tags:
+        tags = u', '.join(options.tags)
+        tags = tags.encode('ascii', 'ignore')
+        md.append(r'{\category %s}'%(tags,))
+    if len(md) > 1:
+        md.append('}')
     stream.seek(0)
     src = stream.read()
-    ans = src[:6] + md + src[6:]
+    ans = src[:6] + u''.join(md) + src[6:]
     stream.seek(0)
     stream.write(ans)
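A toy round trip over the \info group (the sample RTF below is assumed) showing what the new tags and publisher patterns pick out of a document:

    import re
    rtf = r'{\rtf1{\info{\title Example}{\category SF, Space Opera}{\manager Acme Press}}}'
    tags_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)
    publisher_pat = re.compile(r'\{\\info.*?\{\\manager(.*?)(?<!\\)\}', re.DOTALL)
    print tags_pat.search(rtf).group(1).strip()       # SF, Space Opera
    print publisher_pat.search(rtf).group(1).strip()  # Acme Press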
@@ -156,7 +169,7 @@ def set_metadata(stream, options):
     base_pat = r'\{\\name(.*?)(?<!\\)\}'
     title = options.title
-    if title != None:
+    if title is not None:
         title = title.encode('ascii', 'replace')
         pat = re.compile(base_pat.replace('name', 'title'), re.DOTALL)
         if pat.search(src):
@@ -164,7 +177,7 @@ def set_metadata(stream, options):
         else:
             src = add_metadata_item(src, 'title', title)
     comment = options.comments
-    if comment != None:
+    if comment is not None:
         comment = comment.encode('ascii', 'replace')
         pat = re.compile(base_pat.replace('name', 'subject'), re.DOTALL)
         if pat.search(src):
@@ -172,7 +185,7 @@ def set_metadata(stream, options):
         else:
             src = add_metadata_item(src, 'subject', comment)
     author = options.authors
-    if author != None:
+    if author is not None:
         author = ', '.join(author)
         author = author.encode('ascii', 'ignore')
         pat = re.compile(base_pat.replace('name', 'author'), re.DOTALL)
@@ -180,14 +193,23 @@ def set_metadata(stream, options):
             src = pat.sub(r'{\\author ' + author + r'}', src)
         else:
             src = add_metadata_item(src, 'author', author)
-    category = options.get('category', None)
-    if category != None:
-        category = category.encode('ascii', 'replace')
+    tags = options.tags
+    if tags is not None:
+        tags = ', '.join(tags)
+        tags = tags.encode('ascii', 'replace')
         pat = re.compile(base_pat.replace('name', 'category'), re.DOTALL)
         if pat.search(src):
-            src = pat.sub(r'{\\category ' + category + r'}', src)
+            src = pat.sub(r'{\\category ' + tags + r'}', src)
         else:
-            src = add_metadata_item(src, 'category', category)
+            src = add_metadata_item(src, 'category', tags)
+    publisher = options.publisher
+    if publisher is not None:
+        publisher = publisher.encode('ascii', 'replace')
+        pat = re.compile(base_pat.replace('name', 'manager'), re.DOTALL)
+        if pat.search(src):
+            src = pat.sub(r'{\\manager ' + publisher + r'}', src)
+        else:
+            src = add_metadata_item(src, 'manager', publisher)
     stream.seek(pos + olen)
     after = stream.read()
     stream.seek(pos)
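How one base pattern serves every field here (illustrative): the placeholder 'name' is substituted before compiling, so the publisher branch above reuses the same skeleton with 'manager'.

    import re
    base_pat = r'\{\\name(.*?)(?<!\\)\}'
    pat = re.compile(base_pat.replace('name', 'manager'), re.DOTALL)
    print pat.search(r'{\manager Acme Press}').group(1).strip()  # Acme Press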

View File

@@ -18,7 +18,6 @@ class xISBN(object):
         self._data = []
         self._map = {}

-        self.br = browser()
         self.isbn_pat = re.compile(r'[^0-9X]', re.IGNORECASE)

     def purify(self, isbn):
@@ -26,7 +25,7 @@ class xISBN(object):

     def fetch_data(self, isbn):
         url = self.QUERY%isbn
-        data = self.br.open_novisit(url).read()
+        data = browser().open_novisit(url).read()
         data = json.loads(data)
         if data.get('stat', None) != 'ok':
             return []

Some files were not shown because too many files have changed in this diff.