Sync to trunk.

This commit is contained in:
John Schember 2011-11-23 19:34:52 -05:00
commit e32bdc0ce9
27 changed files with 1302 additions and 554 deletions

View File

@@ -1,5 +1,5 @@
 __license__ = 'GPL v3'
-__copyright__ = '2010, Dean Cording'
 __copyright__ = '2011, Pat Stapleton <pat.stapleton at gmail.com>'
 '''
 abc.net.au/news
 '''

@@ -8,7 +8,7 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
 class ABCNews(BasicNewsRecipe):
     title = 'ABC News'
-    __author__ = 'Dean Cording'
     __author__ = 'Pat Stapleton, Dean Cording'
     description = 'News from Australia'
     masthead_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'
     cover_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'

@@ -23,7 +23,9 @@ class ABCNews(BasicNewsRecipe):
     category = 'News, Australia, World'
     language = 'en_AU'
     publication_type = 'newsportal'
-    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
     # preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
     #Remove annoying map links (inline-caption class is also used for some image captions! hence regex to match maps.google)
     preprocess_regexps = [(re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL), lambda m: '')]
     conversion_options = {
         'comments' : description
         ,'tags' : category

@@ -32,23 +34,23 @@ class ABCNews(BasicNewsRecipe):
         ,'linearize_tables': False
     }
-    keep_only_tags = dict(id='article')
     keep_only_tags = [dict(attrs={'class':['article section']})]
-    remove_tags = [dict(attrs={'class':['related', 'tags']}),
-                   dict(id='statepromo')
-                  ]
     remove_tags = [dict(attrs={'class':['related', 'tags', 'tools', 'attached-content ready',
                    'inline-content story left', 'inline-content map left contracted', 'published',
                    'story-map', 'statepromo', 'topics', ]})]
     remove_attributes = ['width','height']
     feeds = [
-        ('Top Stories', 'http://www.abc.net.au/news/syndicate/topstoriesrss.xml'),
         ('Top Stories', 'http://www.abc.net.au/news/feed/45910/rss.xml'),
-        ('Canberra', 'http://www.abc.net.au/news/indexes/idx-act/rss.xml'),
         ('Canberra', 'http://www.abc.net.au/news/feed/6910/rss.xml'),
-        ('Sydney', 'http://www.abc.net.au/news/indexes/sydney/rss.xml'),
         ('Sydney', 'http://www.abc.net.au/news/feed/10232/rss.xml'),
-        ('Melbourne', 'http://www.abc.net.au/news/indexes/melbourne/rss.xml'),
         ('Melbourne', 'http://www.abc.net.au/news/feed/21708/rss.xml'),
-        ('Brisbane', 'http://www.abc.net.au/news/indexes/brisbane/rss.xml'),
         ('Brisbane', 'http://www.abc.net.au/news/feed/12858/rss.xml'),
-        ('Perth', 'http://www.abc.net.au/news/indexes/perth/rss.xml'),
         ('Perth', 'feed://www.abc.net.au/news/feed/24886/rss.xml'),
-        ('Australia', 'http://www.abc.net.au/news/indexes/idx-australia/rss.xml'),
         ('Australia', 'http://www.abc.net.au/news/feed/46182/rss.xml'),
-        ('World', 'http://www.abc.net.au/news/indexes/world/rss.xml'),
         ('World', 'http://www.abc.net.au/news/feed/52278/rss.xml'),
-        ('Business', 'http://www.abc.net.au/news/indexes/business/rss.xml'),
         ('Business', 'http://www.abc.net.au/news/feed/51892/rss.xml'),
-        ('Science and Technology', 'http://www.abc.net.au/news/tag/science-and-technology/rss.xml'),
         ('Science and Technology', 'http://www.abc.net.au/news/feed/2298/rss.xml'),
     ]
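
A minimal sketch (not part of this commit; the article fragment and map link are made up) of how the new maps.google preprocess regex behaves: the Google Maps caption link is stripped, while an image caption that happens to share the inline-caption class survives.

import re

sample = ('<p>Flood warnings issued for the ACT.</p>'
          '<a class="inline-caption" href="http://maps.google.com/?q=-35.3,149.1">View map</a>'
          '<span class="inline-caption">Photo: hypothetical caption</span>')

cleaned = re.sub(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>',
                 '', sample, flags=re.DOTALL)
print(cleaned)
# The <a ...maps.google...>...</a> element is gone; the <span> caption is untouched.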

View File

@@ -1,61 +1,648 @@
-__license__ = 'GPL v3'
-__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
-'''
-news.bbc.co.uk
-'''
-import re
-from calibre.web.feeds.recipes import BasicNewsRecipe
-class BBC(BasicNewsRecipe):
-    title = 'BBC News'
-    __author__ = 'Darko Miletic, Starson17'
-    description = 'News from UK. '
-    oldest_article = 2
-    max_articles_per_feed = 100
-    no_stylesheets = True
-    #delay = 1
-    use_embedded_content = False
-    encoding = 'utf8'
-    publisher = 'BBC'
-    category = 'news, UK, world'
-    language = 'en_GB'
-    publication_type = 'newsportal'
-    extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
-    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
-    conversion_options = {
-        'comments' : description
-        ,'tags' : category
-        ,'language' : language
-        ,'publisher' : publisher
-        ,'linearize_tables': True
-    }
-    keep_only_tags = [
-        dict(name='div', attrs={'class':['layout-block-a layout-block']})
-        ,dict(attrs={'class':['story-body','storybody']})
-    ]
-    remove_tags = [
-        dict(name='div', attrs={'class':['story-feature related narrow', 'share-help', 'embedded-hyper',
-            'story-feature wide ', 'story-feature narrow']}),
-        dict(id=['hypertab', 'comment-form']),
-    ]
-    remove_attributes = ['width','height']
-    feeds = [
-        ('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
-        ('Science/Nature', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/science/nature/rss.xml'),
-        ('Technology', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/technology/rss.xml'),
-        ('Entertainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
-        ('Magazine', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/magazine/rss.xml'),
-        ('Business', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml'),
-        ('Health', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/health/rss.xml'),
-        ('Americas', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml'),
-        ('Europe', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml'),
-        ('South Asia', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml'),
-        ('UK', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/rss.xml'),
-        ('Asia-Pacific', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/asia-pacific/rss.xml'),
-        ('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
-    ]

##
## Title: BBC News, Sport, and Blog Calibre Recipe
## Contact: mattst - jmstanfield@gmail.com
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright: mattst - jmstanfield@gmail.com
##
## Written: November 2011
## Last Edited: 2011-11-19
##

__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = 'mattst - jmstanfield@gmail.com'

'''
BBC News, Sport, and Blog Calibre Recipe
'''

# Import the regular expressions module.
import re

# Import the BasicNewsRecipe class which this class extends.
from calibre.web.feeds.recipes import BasicNewsRecipe

class BBCNewsSportBlog(BasicNewsRecipe):

#
# **** IMPORTANT USERS READ ME ****
#
# First select the feeds you want then scroll down below the feeds list
# and select the values you want for the other user preferences, like
# oldest_article and such like.
#
#
# Select the BBC rss feeds which you want in your ebook.
# Selected feeds have NO '#' at their start, de-selected feeds begin with a '#'.
#
# Eg.  ("News Home", "http://feeds.bbci.co.uk/... - include feed.
# Eg. #("News Home", "http://feeds.bbci.co.uk/... - do not include feed.
#
# There are 68 feeds below which constitute the bulk of the available rss
# feeds on the BBC web site. These include 5 blogs by editors and
# correspondents, 16 sports feeds, 15 'sub' regional feeds (Eg. North West
# Wales, Scotland Business), and 7 Welsh language feeds.
#
# Some of the feeds are low volume (Eg. blogs), or very low volume (Eg. Click)
# so if "oldest_article = 1.5" (only articles published in the last 36 hours)
# you may get some 'empty feeds' which will not then be included in the ebook.
#
# The 15 feeds currently selected below are simply my default ones.
#
# Note: With all 68 feeds selected, oldest_article set to 2,
# max_articles_per_feed set to 100, and simultaneous_downloads set to 10,
# the ebook creation took 29 minutes on my speedy 100 mbps net connection,
# fairly high-end desktop PC running Linux (Ubuntu Lucid-Lynx).
# More realistically with 15 feeds selected, oldest_article set to 1.5,
# max_articles_per_feed set to 100, and simultaneous_downloads set to 20,
# it took 6 minutes. If that's too slow increase 'simultaneous_downloads'.
#
# Select / de-select the feeds you want in your ebook.
#
feeds = [
("News Home", "http://feeds.bbci.co.uk/news/rss.xml"),
("UK", "http://feeds.bbci.co.uk/news/uk/rss.xml"),
("World", "http://feeds.bbci.co.uk/news/world/rss.xml"),
#("England", "http://feeds.bbci.co.uk/news/england/rss.xml"),
#("Scotland", "http://feeds.bbci.co.uk/news/scotland/rss.xml"),
#("Wales", "http://feeds.bbci.co.uk/news/wales/rss.xml"),
#("N. Ireland", "http://feeds.bbci.co.uk/news/northern_ireland/rss.xml"),
#("Africa", "http://feeds.bbci.co.uk/news/world/africa/rss.xml"),
#("Asia", "http://feeds.bbci.co.uk/news/world/asia/rss.xml"),
#("Europe", "http://feeds.bbci.co.uk/news/world/europe/rss.xml"),
#("Latin America", "http://feeds.bbci.co.uk/news/world/latin_america/rss.xml"),
#("Middle East", "http://feeds.bbci.co.uk/news/world/middle_east/rss.xml"),
("US & Canada", "http://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml"),
("Politics", "http://feeds.bbci.co.uk/news/politics/rss.xml"),
("Science/Environment", "http://feeds.bbci.co.uk/news/science_and_environment/rss.xml"),
("Technology", "http://feeds.bbci.co.uk/news/technology/rss.xml"),
("Magazine", "http://feeds.bbci.co.uk/news/magazine/rss.xml"),
("Entertainment/Arts", "http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml"),
#("Health", "http://feeds.bbci.co.uk/news/health/rss.xml"),
#("Education/Family", "http://feeds.bbci.co.uk/news/education/rss.xml"),
("Business", "http://feeds.bbci.co.uk/news/business/rss.xml"),
("Special Reports", "http://feeds.bbci.co.uk/news/special_reports/rss.xml"),
("Also in the News", "http://feeds.bbci.co.uk/news/also_in_the_news/rss.xml"),
#("Newsbeat", "http://www.bbc.co.uk/newsbeat/rss.xml"),
#("Click", "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/programmes/click_online/rss.xml"),
("Blog: Nick Robinson (Political Editor)", "http://feeds.bbci.co.uk/news/correspondents/nickrobinson/rss.sxml"),
#("Blog: Mark D'Arcy (Parliamentary Correspondent)", "http://feeds.bbci.co.uk/news/correspondents/markdarcy/rss.sxml"),
#("Blog: Robert Peston (Business Editor)", "http://feeds.bbci.co.uk/news/correspondents/robertpeston/rss.sxml"),
#("Blog: Stephanie Flanders (Economics Editor)", "http://feeds.bbci.co.uk/news/correspondents/stephanieflanders/rss.sxml"),
("Blog: Rory Cellan-Jones (Technology correspondent)", "http://feeds.bbci.co.uk/news/correspondents/rorycellanjones/rss.sxml"),
("Sport Front Page", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml"),
#("Football", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml"),
#("Cricket", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/cricket/rss.xml"),
#("Rugby Union", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_union/rss.xml"),
#("Rugby League", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_league/rss.xml"),
#("Tennis", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/tennis/rss.xml"),
#("Golf", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/golf/rss.xml"),
#("Motorsport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/motorsport/rss.xml"),
#("Boxing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/boxing/rss.xml"),
#("Athletics", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/athletics/rss.xml"),
#("Snooker", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/snooker/rss.xml"),
#("Horse Racing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/horse_racing/rss.xml"),
#("Cycling", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/cycling/rss.xml"),
#("Disability Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/disability_sport/rss.xml"),
#("Other Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/rss.xml"),
#("Olympics 2012", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/olympics_2012/rss.xml"),
#("N. Ireland Politics", "http://feeds.bbci.co.uk/news/northern_ireland/northern_ireland_politics/rss.xml"),
#("Scotland Politics", "http://feeds.bbci.co.uk/news/scotland/scotland_politics/rss.xml"),
#("Scotland Business", "http://feeds.bbci.co.uk/news/scotland/scotland_business/rss.xml"),
#("E. Scotland, Edinburgh & Fife", "http://feeds.bbci.co.uk/news/scotland/edinburgh_east_and_fife/rss.xml"),
#("W. Scotland & Glasgow", "http://feeds.bbci.co.uk/news/scotland/glasgow_and_west/rss.xml"),
#("Highlands & Islands", "http://feeds.bbci.co.uk/news/scotland/highlands_and_islands/rss.xml"),
#("NE. Scotland, Orkney & Shetland", "http://feeds.bbci.co.uk/news/scotland/north_east_orkney_and_shetland/rss.xml"),
#("South Scotland", "http://feeds.bbci.co.uk/news/scotland/south_scotland/rss.xml"),
#("Central Scotland & Tayside", "http://feeds.bbci.co.uk/news/scotland/tayside_and_central/rss.xml"),
#("Wales Politics", "http://feeds.bbci.co.uk/news/wales/wales_politics/rss.xml"),
#("NW. Wales", "http://feeds.bbci.co.uk/news/wales/north_west_wales/rss.xml"),
#("NE. Wales", "http://feeds.bbci.co.uk/news/wales/north_east_wales/rss.xml"),
#("Mid. Wales", "http://feeds.bbci.co.uk/news/wales/mid_wales/rss.xml"),
#("SW. Wales", "http://feeds.bbci.co.uk/news/wales/south_west_wales/rss.xml"),
#("SE. Wales", "http://feeds.bbci.co.uk/news/wales/south_east_wales/rss.xml"),
#("Newyddion - News in Welsh", "http://feeds.bbci.co.uk/newyddion/rss.xml"),
#("Gwleidyddiaeth", "http://feeds.bbci.co.uk/newyddion/gwleidyddiaeth/rss.xml"),
#("Gogledd-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/gogledd-ddwyrain/rss.xml"),
#("Gogledd-Orllewin", "http://feeds.bbci.co.uk/newyddion/gogledd-orllewin/rss.xml"),
#("Canolbarth", "http://feeds.bbci.co.uk/newyddion/canolbarth/rss.xml"),
#("De-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/de-ddwyrain/rss.xml"),
#("De-Orllewin", "http://feeds.bbci.co.uk/newyddion/de-orllewin/rss.xml"),
]
# **** SELECT YOUR USER PREFERENCES ****
# Title to use for the ebook.
#
title = 'BBC News'
# A brief description for the ebook.
#
description = u'BBC web site ebook created using rss feeds.'
# The max number of articles which may be downloaded from each feed.
# I've never seen more than about 70 articles in a single feed in the
# BBC feeds.
#
max_articles_per_feed = 100
# The max age of articles which may be downloaded from each feed. This is
# specified in days - note fractions of days are allowed, Eg. 2.5 (2 and a
# half days). My default of 1.5 days is the last 36 hours, the point at
# which I've decided 'news' becomes 'old news', but be warned this is not
# so good for the blogs, technology, magazine, etc., and sports feeds.
# You may wish to extend this to 2-5 but watch out ebook creation time will
# increase as well. Setting this to 30 will get everything (AFAICT) as long
# as max_articles_per_feed remains set high (except for 'Click' which is
# v. low volume and its currently oldest article is 4th Feb 2011).
#
oldest_article = 1.5
# Number of simultaneous downloads. 20 is consistently working fine on the
# BBC News feeds with no problems. Speeds things up from the default of 5.
# If you have a lot of feeds and/or have increased oldest_article above 2
# then you may wish to try increasing simultaneous_downloads to 25-30,
# or, of course, if you are in a hurry. [I've not tried beyond 20.]
#
simultaneous_downloads = 20
# Timeout for fetching files from the server in seconds. The default of
# 120 seconds seems somewhat excessive.
#
timeout = 30
# The format string for the date shown on the ebook's first page.
# List of all values: http://docs.python.org/library/time.html
# Default in news.py has a leading space so that's mirrored here.
# As with 'feeds' select/de-select by adding/removing the initial '#',
# only one timefmt should be selected, here's a few to choose from.
#
timefmt = ' [%a, %d %b %Y]' # [Fri, 14 Nov 2011] (Calibre default)
#timefmt = ' [%a, %d %b %Y %H:%M]' # [Fri, 14 Nov 2011 18:30]
#timefmt = ' [%a, %d %b %Y %I:%M %p]' # [Fri, 14 Nov 2011 06:30 PM]
#timefmt = ' [%d %b %Y]' # [14 Nov 2011]
#timefmt = ' [%d %b %Y %H:%M]' # [14 Nov 2011 18.30]
#timefmt = ' [%Y-%m-%d]' # [2011-11-14]
#timefmt = ' [%Y-%m-%d-%H-%M]' # [2011-11-14-18-30]
#
# **** IMPORTANT ****
#
# DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
#
# DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
#
# I MEAN IT, YES I DO, ABSOLUTELY, AT YOUR OWN RISK. :)
#
# **** IMPORTANT ****
#
# Author of this recipe.
__author__ = 'mattst'
# Specify English as the language of the RSS feeds (ISO-639 code).
language = 'en_GB'
# Set tags.
tags = 'news, sport, blog'
# Set publisher and publication type.
publisher = 'BBC'
publication_type = 'newspaper'
# Disable stylesheets from site.
no_stylesheets = True
# Specifies an override encoding for sites that have an incorrect charset
# specified. Default of 'None' says to auto-detect. Some other BBC recipes
# use 'utf8', which works fine (so use that if necessary) but auto-detecting
# with None is working fine, so stick with that for robustness.
encoding = None
# Sets whether a feed has full articles embedded in it. The BBC feeds do not.
use_embedded_content = False
# Removes empty feeds - why keep them!?
remove_empty_feeds = True
# Create a custom title which fits nicely in the Kindle title list.
# Requires "import time" above class declaration, and replacing
# title with custom_title in conversion_options (right column only).
# Example of string below: "BBC News - 14 Nov 2011"
#
# custom_title = "BBC News - " + time.strftime('%d %b %Y')
'''
# Conversion options for advanced users, but don't forget to comment out the
# current conversion_options below. Avoid setting 'linearize_tables' as that
# plays havoc with the 'old style' table based pages.
#
conversion_options = { 'title' : title,
'comments' : description,
'tags' : tags,
'language' : language,
'publisher' : publisher,
'authors' : publisher,
'smarten_punctuation' : True
}
'''
conversion_options = { 'smarten_punctuation' : True }

# Specify extra CSS - overrides ALL other CSS (IE. Added last).
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
.introduction, .first { font-weight: bold; } \
.cross-head { font-weight: bold; font-size: 125%; } \
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
.cap, .caption, .caption img, .caption span { display: block; text-align: center; margin: 5px auto; } \
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
text-align: center; font-size: 80%; font-style: italic; margin: 1px auto; } \
.story-date, .published { font-size: 80%; } \
table { width: 100%; } \
td img { display: block; margin: 5px auto; } \
ul { padding-top: 10px; } \
ol { padding-top: 10px; } \
li { padding-top: 5px; padding-bottom: 5px; } \
h1 { text-align: center; font-size: 175%; font-weight: bold; } \
h2 { text-align: center; font-size: 150%; font-weight: bold; } \
h3 { text-align: center; font-size: 125%; font-weight: bold; } \
h4, h5, h6 { text-align: center; font-size: 100%; font-weight: bold; }'
# Remove various tag attributes to improve the look of the ebook pages.
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
# Remove the (admittedly rarely used) line breaks, "<br />", which sometimes
# cause a section of the ebook to start in an unsightly fashion or, more
# frequently, a "<br />" will muck up the formatting of a correspondent's byline.
# "<br />" and "<br clear/>" are far more frequently used on the table formatted
# style of pages, and really spoil the look of the ebook pages.
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
                      (re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: '')]
# Create regular expressions for tag keeping and removal to make the matches more
# robust against minor changes and errors in the HTML, Eg. double spaces, leading
# and trailing spaces, missing hyphens, and such like.
# Python regular expression ('re' class) page: http://docs.python.org/library/re.html
# ***************************************
# Regular expressions for keep_only_tags:
# ***************************************
# The BBC News HTML pages use variants of 'storybody' to denote the section of a HTML
# page which contains the main text of the article. Match storybody variants: 'storybody',
# 'story-body', 'story body','storybody ', etc.
storybody_reg_exp = '^.*story[_ -]*body.*$'
# The BBC sport and 'newsbeat' (features) HTML pages use 'blq_content' to hold the title
# and published date. This is one level above the usual news pages which have the title
# and date within 'story-body'. This is annoying since 'blq_content' must also be kept,
# resulting in a lot of extra things to be removed by remove_tags.
blq_content_reg_exp = '^.*blq[_ -]*content.*$'
# The BBC has an alternative page design structure, which I suspect is an out-of-date
# design but which is still used in some articles, Eg. 'Click' (technology), 'FastTrack'
# (travel), and in some sport pages. These alternative pages are table based (which is
# why I think they are an out-of-date design) and account for -I'm guesstimating- less
# than 1% of all articles. They use a table class 'storycontent' to hold the article
# and like blq_content (above) have required lots of extra removal by remove_tags.
story_content_reg_exp = '^.*story[_ -]*content.*$'
# Keep the sections of the HTML which match the list below. The HTML page created by
# Calibre will fill <body> with those sections which are matched. Note that the
# blq_content_reg_exp must be listed before storybody_reg_exp in keep_only_tags due to
# it being the parent of storybody_reg_exp, that is to say the div class/id 'story-body'
# will be inside div class/id 'blq_content' in the HTML (if 'blq_content' is there at
# all). If they are the other way around in keep_only_tags then blq_content_reg_exp
# will end up being discarded.
keep_only_tags = [ dict(name='table', attrs={'class':re.compile(story_content_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(blq_content_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'id':re.compile(blq_content_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'id':re.compile(storybody_reg_exp, re.IGNORECASE)}) ]
# ************************************
# Regular expressions for remove_tags:
# ************************************
# Regular expression to remove share-help and variant tags. The share-help class
# is used by the site for a variety of 'sharing' type links, Eg. Facebook, delicious,
# twitter, email. Removed to avoid page clutter.
share_help_reg_exp = '^.*share[_ -]*help.*$'
# Regular expression to remove embedded-hyper and variant tags. This class is used to
# display links to other BBC News articles on the same/similar subject.
embedded_hyper_reg_exp = '^.*embed*ed[_ -]*hyper.*$'
# Regular expression to remove hypertabs and variant tags. This class is used to
# display a tab bar at the top of an article which allows the user to switch to
# an article (viewed on the same page) providing further info., 'in depth' analysis,
# an editorial, a correspondant's blog entry, and such like. The ability to handle
# a tab bar of this nature is currently beyond the scope of this recipe and
# possibly of Calibre itself (not sure about that - TO DO - check!).
hypertabs_reg_exp = '^.*hyper[_ -]*tabs.*$'
# Regular expression to remove story-feature and variant tags. Eg. 'story-feature',
# 'story-feature related narrow', 'story-feature wide', 'story-feature narrow'.
# This class is used to add additional info. boxes, or small lists, outside of
# the main story. TO DO: Work out a way to incorporate these neatly.
story_feature_reg_exp = '^.*story[_ -]*feature.*$'
# Regular expression to remove video and variant tags, Eg. 'videoInStoryB',
# 'videoInStoryC'. This class is used to embed video.
video_reg_exp = '^.*video.*$'
# Regular expression to remove audio and variant tags, Eg. 'audioInStoryD'.
# This class is used to embed audio.
audio_reg_exp = '^.*audio.*$'
# Regular expression to remove pictureGallery and variant tags, Eg. 'pictureGallery'.
# This class is used to embed a photo slideshow. See also 'slideshow' below.
picture_gallery_reg_exp = '^.*picture.*$'
# Regular expression to remove slideshow and variant tags, Eg. 'dslideshow-enclosure'.
# This class is used to embed a slideshow (not necessarily photo) but both
# 'slideshow' and 'pictureGallery' are used for slideshows.
slideshow_reg_exp = '^.*slide[_ -]*show.*$'
# Regular expression to remove social-links and variant tags. This class is used to
# display links to a BBC bloggers main page, used in various columnist's blogs
# (Eg. Nick Robinson, Robert Peston).
social_links_reg_exp = '^.*social[_ -]*links.*$'
# Regular expression to remove quote and (multi) variant tags, Eg. 'quote',
# 'endquote', 'quote-credit', 'quote-credit-title', etc. These are usually
# removed by 'story-feature' removal (as they are usually within them), but
# not always. The quotation removed is always (AFAICT) in the article text
# as well but a 2nd copy is placed in a quote tag to draw attention to it.
# The quote class tags may or may not appear in div's.
quote_reg_exp = '^.*quote.*$'
# Regular expression to remove hidden and variant tags, Eg. 'hidden'.
# The purpose of these is unclear, they seem to be an internal link to a
# section within the article, but the text of the link (Eg. 'Continue reading
# the main story') never seems to be displayed anyway. Removed to avoid clutter.
# The hidden class tags may or may not appear in div's.
hidden_reg_exp = '^.*hidden.*$'
# Regular expression to remove comment and variant tags, Eg. 'comment-introduction'.
# Used on the site to display text about registered users entering comments.
comment_reg_exp = '^.*comment.*$'
# Regular expression to remove form and variant tags, Eg. 'comment-form'.
# Used on the site to allow registered BBC users to fill in forms, typically
# for entering comments about an article.
form_reg_exp = '^.*form.*$'
# Extra things to remove due to the addition of 'blq_content' in keep_only_tags.
#<div class="story-actions"> Used on sports pages for 'email' and 'print'.
story_actions_reg_exp = '^.*story[_ -]*actions.*$'
#<div class="bookmark-list"> Used on sports pages instead of 'share-help' (for
# social networking links).
bookmark_list_reg_exp = '^.*bookmark[_ -]*list.*$'
#<div id="secondary-content" class="content-group">
# NOTE: Don't remove class="content-group" that is needed.
# Used on sports pages to link to 'similar stories'.
secondary_content_reg_exp = '^.*secondary[_ -]*content.*$'
#<div id="featured-content" class="content-group">
# NOTE: Don't remove class="content-group" that is needed.
# Used on sports pages to link to pages like 'tables', 'fixtures', etc.
featured_content_reg_exp = '^.*featured[_ -]*content.*$'
#<div id="navigation">
# Used on sports pages to link to pages like 'tables', 'fixtures', etc.
# Used sometimes instead of "featured-content" above.
navigation_reg_exp = '^.*navigation.*$'
#<a class="skip" href="#blq-container-inner">Skip to top</a>
# Used on sports pages to link to the top of the page.
skip_reg_exp = '^.*skip.*$'
# Extra things to remove due to the addition of 'storycontent' in keep_only_tags,
# which are the alternative table design based pages. The purpose of some of these
# is not entirely clear from the pages (which are a total mess!).
# Remove mapping based tags, Eg. <map id="world_map">
# The dynamic maps don't seem to work during ebook creation. TO DO: Investigate.
map_reg_exp = '^.*map.*$'
# Remove social bookmarking variation, called 'socialBookMarks'.
social_bookmarks_reg_exp = '^.*social[_ -]*bookmarks.*$'
# Remove page navigation tools, like 'search', 'email', 'print', called 'blq-mast'.
blq_mast_reg_exp = '^.*blq[_ -]*mast.*$'
# Remove 'sharesb', I think this is a generic 'sharing' class. It seems to appear
# alongside 'socialBookMarks' whenever that appears. I am removing it as well
# under the assumption that it can appear alone as well.
sharesb_reg_exp = '^.*sharesb.*$'
# Remove class 'o'. The worst named user created css class of all time. The creator
# should immediately be fired. I've seen it used to hold nothing at all but with
# 20 or so empty lines in it. Also to hold a single link to another article.
# Whatever it was designed to do it is not wanted by this recipe. Exact match only.
o_reg_exp = '^o$'
# Remove 'promotopbg' and 'promobottombg', link lists. Have decided to
# use two reg expressions to make removing this (and variants) robust.
promo_top_reg_exp = '^.*promotopbg.*$'
promo_bottom_reg_exp = '^.*promobottombg.*$'
# Remove 'nlp', provides heading for link lists. Requires an exact match due to
# risk of matching those letters in something needed, unless I see a variation
# of 'nlp' used at a later date.
nlp_reg_exp = '^nlp$'
# Remove 'mva', provides embedded floating content of various types. Variant 'mvb'
# has also now been seen. Requires an exact match of 'mva' or 'mvb' due to risk of
# matching those letters in something needed.
mva_or_mvb_reg_exp = '^mv[ab]$'
# Remove 'mvtb', seems to be page navigation tools, like 'blq-mast'.
mvtb_reg_exp = '^mvtb$'
# Remove 'blq-toplink', class to provide a link to the top of the page.
blq_toplink_reg_exp = '^.*blq[_ -]*top[_ -]*link.*$'
# Remove 'products and services' links, Eg. desktop tools, alerts, and so on.
# Eg. Class="servicev4 ukfs_services" - what a mess of a name. Have decided to
# use two reg expressions to make removing this (and variants) robust.
prods_services_01_reg_exp = '^.*servicev4.*$'
prods_services_02_reg_exp = '^.*ukfs[_ -]*services.*$'
# Remove -what I think is- some kind of navigation tools helper class, though I am
# not sure, it's called: 'blq-rst blq-new-nav'. What I do know is it pops up
# frequently and it is not wanted. Have decided to use two reg expressions to make
# removing this (and variants) robust.
blq_misc_01_reg_exp = '^.*blq[_ -]*rst.*$'
blq_misc_02_reg_exp = '^.*blq[_ -]*new[_ -]*nav.*$'
# Remove 'puffbox' - this may only appear inside 'storyextra', so it may not
# need removing - I have no clue what it does other than it contains links.
# Whatever it is - it is not part of the article and is not wanted.
puffbox_reg_exp = '^.*puffbox.*$'
# Remove 'sibtbg' and 'sibtbgf' - some kind of table formatting classes.
sibtbg_reg_exp = '^.*sibtbg.*$'
# Remove 'storyextra' - links to relevant articles and external sites.
storyextra_reg_exp = '^.*story[_ -]*extra.*$'
remove_tags = [ dict(name='div', attrs={'class':re.compile(story_feature_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(share_help_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(embedded_hyper_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(hypertabs_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(video_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(audio_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(picture_gallery_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(slideshow_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(quote_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(hidden_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(comment_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(story_actions_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(bookmark_list_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'id':re.compile(secondary_content_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'id':re.compile(featured_content_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'id':re.compile(navigation_reg_exp, re.IGNORECASE)}),
dict(name='form', attrs={'id':re.compile(form_reg_exp, re.IGNORECASE)}),
dict(attrs={'class':re.compile(quote_reg_exp, re.IGNORECASE)}),
dict(attrs={'class':re.compile(hidden_reg_exp, re.IGNORECASE)}),
dict(attrs={'class':re.compile(social_links_reg_exp, re.IGNORECASE)}),
dict(attrs={'class':re.compile(comment_reg_exp, re.IGNORECASE)}),
dict(attrs={'class':re.compile(skip_reg_exp, re.IGNORECASE)}),
dict(name='map', attrs={'id':re.compile(map_reg_exp, re.IGNORECASE)}),
dict(name='map', attrs={'name':re.compile(map_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'id':re.compile(social_bookmarks_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'id':re.compile(blq_mast_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(sharesb_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(o_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(promo_top_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(promo_bottom_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(nlp_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(mva_or_mvb_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(mvtb_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(blq_toplink_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(prods_services_01_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(prods_services_02_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(blq_misc_01_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(blq_misc_02_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(puffbox_reg_exp, re.IGNORECASE)}),
dict(attrs={'class':re.compile(sibtbg_reg_exp, re.IGNORECASE)}),
dict(attrs={'class':re.compile(storyextra_reg_exp, re.IGNORECASE)})
]
# Uses url to create and return the 'printer friendly' version of the url.
# In other words the 'print this page' address of the page.
#
# There are 3 types of urls used in the BBC site's rss feeds. There is just
# 1 type for the standard news while there are 2 used for sports feed urls.
# Note: Sports urls are linked from regular news feeds (Eg. 'News Home') when
# there is a major story of interest to 'everyone'. So even if no BBC sports
# feeds are added to 'feeds' the logic of this method is still needed to avoid
# blank / missing / empty articles which have an index title and then no body.
def print_version(self, url):
# Handle sports page urls type 01:
if (url.find("go/rss/-/sport1/") != -1):
temp_url = url.replace("go/rss/-/", "")
# Handle sports page urls type 02:
elif (url.find("go/rss/int/news/-/sport1/") != -1):
temp_url = url.replace("go/rss/int/news/-/", "")
# Handle regular news page urls:
else:
temp_url = url.replace("go/rss/int/news/-/", "")
# Always add "?print=true" to the end of the url.
print_url = temp_url + "?print=true"
return print_url
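
A worked example (with a hypothetical feed url of the standard news form) of what print_version() produces: the rss redirect segment is stripped and the print query string is appended.

rss_url = "http://www.bbc.co.uk/go/rss/int/news/-/news/uk-00000000"   # hypothetical article id
print_url = rss_url.replace("go/rss/int/news/-/", "") + "?print=true"
print(print_url)   # http://www.bbc.co.uk/news/uk-00000000?print=true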
# Remove articles in feeds based on a string in the article title or url.
#
# Code logic written by: Starson17 - posted in: "Recipes - Re-usable code"
# thread, in post with title: "Remove articles from feed", see url:
# http://www.mobileread.com/forums/showpost.php?p=1165462&postcount=6
# Many thanks and all credit to Starson17.
#
# Starson17's code has obviously been altered to suit my requirements.
def parse_feeds(self):
# Call parent's method.
feeds = BasicNewsRecipe.parse_feeds(self)
# Loop through all feeds.
for feed in feeds:
# Loop through all articles in feed.
for article in feed.articles[:]:
# Match key words and remove article if there's a match.
# Most BBC rss feed video only 'articles' use upper case 'VIDEO'
# as a title prefix. Just match upper case 'VIDEO', so that
# articles like 'Video game banned' won't be matched and removed.
if 'VIDEO' in article.title:
feed.articles.remove(article)
# Most BBC rss feed audio only 'articles' use upper case 'AUDIO'
# as a title prefix. Just match upper case 'AUDIO', so that
# articles like 'Hi-Def audio...' won't be matched and removed.
elif 'AUDIO' in article.title:
feed.articles.remove(article)
# Most BBC rss feed photo slideshow 'articles' use 'In Pictures',
# 'In pictures', and 'in pictures', somewhere in their title.
# Match any case of that phrase.
elif 'IN PICTURES' in article.title.upper():
feed.articles.remove(article)
# As above, but user contributed pictures. Match any case.
elif 'YOUR PICTURES' in article.title.upper():
feed.articles.remove(article)
# 'Sportsday Live' are articles which contain a constantly and
# dynamically updated 'running commentary' during a live sporting
# event. Match any case.
elif 'SPORTSDAY LIVE' in article.title.upper():
feed.articles.remove(article)
# Sometimes 'Sportsday Live' (above) becomes 'Live - Sport Name'.
# These are being matched below using 'Live - ' because removing all
# articles with 'live' in their titles would remove some articles
# that are in fact not live sports pages. Match any case.
elif 'LIVE - ' in article.title.upper():
feed.articles.remove(article)
# 'Quiz of the week' is a Flash player weekly news quiz. Match only
# the 'Quiz of the' part in anticipation of monthly and yearly
# variants. Match any case.
elif 'QUIZ OF THE' in article.title.upper():
feed.articles.remove(article)
# Remove articles with 'scorecards' in the url. These are BBC sports
# pages which just display a cricket scorecard. The pages have a mass
# of table and css entries to display the scorecards nicely. Probably
# could make them work with this recipe, but might take a whole day
# of work to sort out all the css - basically a formatting nightmare.
elif 'scorecards' in article.url:
feed.articles.remove(article)
return feeds
# End of class and file.

View File

@@ -1,35 +1,43 @@
 #!/usr/bin/env python
 __license__ = 'GPL v3'
-__author__ = 'Gerardo Diez'
-__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
-description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
-__docformat__ = 'restructuredtext en'
 __copyright__ = '5, January 2011 Gerardo Diez<gerardo.diez.garcia@gmail.com> & desUBIKado'
 __author__ = 'desUBIKado, based on an earlier version by Gerardo Diez'
 __version__ = 'v1.01'
 __date__ = '13, November 2011'
 '''
-expansion.es
 [url]http://www.expansion.com/[/url]
 '''

 import time
 import re
 from calibre.web.feeds.recipes import BasicNewsRecipe

-class Publico(BasicNewsRecipe):
-    title =u'Expansion.com'
-    __author__ ='Gerardo Diez'
-    publisher =u'Unidad Editorial Información Económica, S.L.'
-    category ='finances, catalunya'
-    oldest_article =1
-    simultaneous_downloads =10
-    cover_url =u'http://estaticos01.expansion.com/iconos/v2.x/v2.0/cabeceras/logo_expansion.png'
-    timefmt ='[%A, %d %B, %Y]'
-    encoding ='latin'
-    remove_javascript =True
-    no_stylesheets =True
 class expansion_spanish(BasicNewsRecipe):
     __author__ ='Gerardo Diez & desUBIKado'
     description ='Financial news from Spain'
     title =u'Expansion'
     publisher =u'Unidad Editorial Internet, S.L.'
     category ='news, finances, Spain'
     oldest_article = 2
     simultaneous_downloads = 10
     max_articles_per_feed =100
     timefmt = '[%a, %d %b, %Y]'
     encoding ='iso-8859-15'
     language ='es'
     use_embedded_content = False
     remove_javascript = True
     no_stylesheets = True
     remove_empty_feeds = True

     keep_only_tags =dict(name='div', attrs={'class':['noticia primer_elemento']})
     remove_tags =[
-        dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto']}),
-        dict(name='ul', attrs={'class':['bolos_desarrollo_noticia']}),
         dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto','tit_relacionadas','interact','paginacion estirar','sumario derecha']}),
         dict(name='ul', attrs={'class':['bolos_desarrollo_noticia','not_logged']}),
         dict(name='span', attrs={'class':['comentarios']}),
         dict(name='p', attrs={'class':['cintillo_comentarios', 'cintillo_comentarios formulario']}),
-        dict(name='div', attrs={'id':['comentarios_lectores_listado']})
         dict(name='div', attrs={'id':['comentarios_lectores_listado','comentar']})
         ]
     feeds =[
         (u'Portada', u'http://estaticos.expansion.com/rss/portada.xml'),

@@ -38,42 +46,112 @@ class Publico(BasicNewsRecipe):
         (u'Euribor', u'http://estaticos.expansion.com/rss/mercadoseuribor.xml'),
         (u'Materias Primas', u'http://estaticos.expansion.com/rss/mercadosmateriasprimas.xml'),
         (u'Renta Fija', u'http://estaticos.expansion.com/rss/mercadosrentafija.xml'),
         (u'Portada: Mi Dinero', u'http://estaticos.expansion.com/rss/midinero.xml'),
         (u'Hipotecas', u'http://estaticos.expansion.com/rss/midinerohipotecas.xml'),
-        (u'Créditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
         (u'Cr\xe9ditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
         (u'Pensiones', u'http://estaticos.expansion.com/rss/midineropensiones.xml'),
-        (u'Fondos de Inversión', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
         (u'Fondos de Inversi\xf3n', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
         (u'Motor', u'http://estaticos.expansion.com/rss/midineromotor.xml'),
         (u'Portada: Empresas', u'http://estaticos.expansion.com/rss/empresas.xml'),
         (u'Banca', u'http://estaticos.expansion.com/rss/empresasbanca.xml'),
         (u'TMT', u'http://estaticos.expansion.com/rss/empresastmt.xml'),
-        (u'Energía', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
         (u'Energ\xeda', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
-        (u'Inmobiliario y Construcción', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
         (u'Inmobiliario y Construcci\xf3n', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
         (u'Transporte y Turismo', u'http://estaticos.expansion.com/rss/empresastransporte.xml'),
-        (u'Automoción e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
         (u'Automoci\xf3n e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
-        (u'Distribución', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
         (u'Distribuci\xf3n', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
-        (u'Deporte y Negocio', u' http://estaticos.expansion.com/rss/empresasdeporte.xml'),
         (u'Deporte y Negocio', u' [url]http://estaticos.expansion.com/rss/empresasdeporte.xml[/url]'),
         (u'Mi Negocio', u'http://estaticos.expansion.com/rss/empresasminegocio.xml'),
         (u'Interiores', u'http://estaticos.expansion.com/rss/empresasinteriores.xml'),
         (u'Digitech', u'http://estaticos.expansion.com/rss/empresasdigitech.xml'),
-        (u'Portada: Economía y Política', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
-        (u'Política', u'http://estaticos.expansion.com/rss/economia.xml'),
         (u'Portada: Econom\xeda y Pol\xedtica', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
         (u'Pol\xedtica', u'http://estaticos.expansion.com/rss/economia.xml'),
         (u'Portada: Sociedad', u'http://estaticos.expansion.com/rss/entorno.xml'),
-        (u'Portada: Opinión', u'http://estaticos.expansion.com/rss/opinion.xml'),
         (u'Portada: Opini\xf3n', u'http://estaticos.expansion.com/rss/opinion.xml'),
         (u'Llaves y editoriales', u'http://estaticos.expansion.com/rss/opinioneditorialyllaves.xml'),
         (u'Tribunas', u'http://estaticos.expansion.com/rss/opiniontribunas.xml'),
-        (u'Portada: Jurídico', u'http://estaticos.expansion.com/rss/juridico.xml'),
         (u'Portada: Jur\xeddico', u'http://estaticos.expansion.com/rss/juridico.xml'),
         (u'Entrevistas', u'http://estaticos.expansion.com/rss/juridicoentrevistas.xml'),
-        (u'Opinión', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
         (u'Opini\xf3n', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
         (u'Sentencias', u'http://estaticos.expansion.com/rss/juridicosentencias.xml'),
         (u'Mujer', u'http://estaticos.expansion.com/rss/mujer-empresa.xml'),
-        (u'Catalu&ntilde;a', u'http://estaticos.expansion.com/rss/catalunya.xml'),
         (u'Catalu\xf1a', u'http://estaticos.expansion.com/rss/catalunya.xml'),
-        (u'Función pública', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
         (u'Funci\xf3n p\xfablica', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
         ]
# Fetch the cover image
def get_cover_url(self):
cover = None
st = time.localtime()
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
#[url]http://img5.kiosko.net/2011/11/14/es/expansion.750.jpg[/url]
cover='http://img5.kiosko.net/'+ year + '/' + month + '/' + day +'/es/expansion.750.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
self.log("\nPortada no disponible")
cover ='http://www.aproahp.org/enlaces/images/diario_expansion.gif'
return cover
# To keep the advertising interstitial from appearing when an article is fetched, and to always
# get the actual web page, send a "t" parameter carrying the current "linux"/"epoch" time,
# making the site believe the advert has only just been viewed
def print_version(self, url):
st = time.time()
segundos = str(int(st))
parametros = '.html?t=' + segundos
return url.replace('.html', parametros)
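
A worked example (hypothetical article url) of the epoch-time trick above: the current Unix time is appended as a "t" parameter so the site believes the advert was just shown.

import time

url = 'http://www.expansion.com/2011/11/13/economia/0000000000.html'   # hypothetical
print(url.replace('.html', '.html?t=' + str(int(time.time()))))
# -> http://www.expansion.com/2011/11/13/economia/0000000000.html?t=<current epoch seconds>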
_processed_links = []
def get_article_url(self, article):
# Obtain the original article url from the "feedsportal" one
link = article.get('link', None)
if link is None:
return article
if link.split('/')[-1]=="story01.htm":
link=link.split('/')[-2]
a=['0B','0C','0D','0E','0F','0G','0N' ,'0L0S','0A']
b=['.' ,'/' ,'?' ,'-' ,'=' ,'&' ,'.com','www.','0']
for i in range(0,len(a)):
link=link.replace(a[i],b[i])
link="http://"+link
# Drop articles duplicated in other feeds
if not (link in self._processed_links):
self._processed_links.append(link)
else:
link = None
return link
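
A worked example of the feedsportal decoding above, using a made-up encoded segment (the substitution pairs are the same ones used in get_article_url):

link = 'http://rss.feedsportal.com/c/000/f/000/0L0Sexpansion0N0C20A110C110C130Ceconomia0Bhtml/story01.htm'   # made-up
link = link.split('/')[-2]
a = ['0B','0C','0D','0E','0F','0G','0N'  ,'0L0S','0A']
b = ['.' ,'/' ,'?' ,'-' ,'=' ,'&' ,'.com','www.','0' ]
for i in range(0, len(a)):
    link = link.replace(a[i], b[i])
print('http://' + link)   # http://www.expansion.com/2011/11/13/economia.html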
# A little css to improve the presentation of the articles
extra_css = '''
.entradilla {font-family:Arial,Helvetica,sans-serif; font-weight:bold; font-style:italic; font-size:16px;}
.fecha_publicacion,.autor {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
'''
# To display the poster image of embedded videos
preprocess_regexps = [
(re.compile(r'var imagen', re.DOTALL|re.IGNORECASE), lambda match: '--></script><img src'),
(re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
(re.compile(r'var id_reproductor', re.DOTALL|re.IGNORECASE), lambda match: '<script language="Javascript" type="text/javascript"><!--'),
]
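
A small illustration (made-up page fragment and image url) of how the three substitutions above turn the embedded-video javascript into a visible poster image:

import re

snippet = 'var imagen="http://estaticos.expansion.com/fotos/hypothetical_poster.jpg"; var id_reproductor="xyz";'
for pat, repl in [
        (re.compile(r'var imagen', re.DOTALL|re.IGNORECASE), lambda m: '--></script><img src'),
        (re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda m: '.jpg">'),
        (re.compile(r'var id_reproductor', re.DOTALL|re.IGNORECASE), lambda m: '<script language="Javascript" type="text/javascript"><!--')]:
    snippet = pat.sub(repl, snippet)
print(snippet)
# --></script><img src="http://estaticos.expansion.com/fotos/hypothetical_poster.jpg"> <script language="Javascript" type="text/javascript"><!--="xyz";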

View File

@@ -80,59 +80,11 @@ class Nin(BasicNewsRecipe):
                return self.PREFIX + item.img['src']
        return cover_url

-    def parse_index(self):
-        articles = []
-        count = 0
-        soup = self.index_to_soup(self.INDEX)
-        for item in soup.findAll('a',attrs={'class':'lmeninavFont'}):
-            count = count +1
-            if self.test and count > 2:
-                return articles
-            section = self.tag_to_string(item)
-            feedlink = self.PREFIX + item['href']
-            feedpage = self.index_to_soup(feedlink)
-            self.report_progress(0, _('Fetching feed')+' %s...'%(section))
-            inarts = []
-            for art in feedpage.findAll('span',attrs={'class':'artTitle'}):
-                alink = art.parent
-                url = self.PREFIX + alink['href']
-                title = self.tag_to_string(art)
-                sparent = alink.parent
-                alink.extract()
-                description = self.tag_to_string(sparent)
-                date = strftime(self.timefmt)
-                inarts.append({
-                    'title'      :title
-                   ,'date'       :date
-                   ,'url'        :url
-                   ,'description':description
-                })
-            articles.append((section,inarts))
-        return articles
-
-    def index_to_soup(self, url_or_raw, raw=False):
-        if re.match(r'\w+://', url_or_raw):
-            open_func = getattr(self.browser, 'open_novisit', self.browser.open)
-            with closing(open_func(url_or_raw)) as f:
-                _raw = f.read()
-            if not _raw:
-                raise RuntimeError('Could not fetch index from %s'%url_or_raw)
-        else:
-            _raw = url_or_raw
-        if raw:
-            return _raw
-        if not isinstance(_raw, unicode) and self.encoding:
-            if callable(self.encoding):
-                _raw = self.encoding(_raw)
-            else:
-                _raw = _raw.decode(self.encoding, 'replace')
-        massage = list(BeautifulSoup.MARKUP_MASSAGE)
-        enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
-        massage.append((re.compile(r'&(\S+?);'), lambda match:
-            entity_to_unicode(match, encoding=enc)))
-        massage.append((re.compile(r'[\x00-\x08]+'), lambda match:
-            ''))
-        return BeautifulSoup(_raw, markupMassage=massage)

    feeds = [(u'NIN Online', u'http://www.nin.co.rs/misc/rss.php?feed=RSS2.0')]

    def get_article_url(self, article):
        url = BasicNewsRecipe.get_article_url(self, article)
        return url.replace('.co.yu', '.co.rs')

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):

View File

@@ -11,17 +11,16 @@ from calibre.web.feeds.news import BasicNewsRecipe

class Salon_com(BasicNewsRecipe):
    title = 'Salon.com'
-   __author__ = 'cix3'
    __author__ = 'Kovid Goyal'
    description = 'Salon.com - Breaking news, opinion, politics, entertainment, sports and culture.'
    timefmt = ' [%b %d, %Y]'
    language = 'en'

    oldest_article = 7
    max_articles_per_feed = 100

    auto_cleanup = True
-   remove_tags = [dict(name='div', attrs={'class':['ad_content', 'clearfix']}), dict(name='hr'), dict(name='img')]
-   remove_tags_before = dict(name='h2')
    auto_cleanup_keep = '//div[@class="art"]'
    remove_empty_feeds = True

    feeds = [
        ('News & Politics', 'http://feeds.salon.com/salon/news'),

@@ -40,5 +39,5 @@ class Salon_com(BasicNewsRecipe):
    ]

    def print_version(self, url):
-       return url.replace('/index.html', '/print.html')
        return url + '/print/'

View File

@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Worldcrunch(BasicNewsRecipe):
title = u'Worldcrunch'
__author__ = 'Krittika Goyal'
oldest_article = 1 #days
max_articles_per_feed = 25
use_embedded_content = False
no_stylesheets = True
auto_cleanup = True
feeds = [
('News',
'http://www.worldcrunch.com/feed'),
]

View File

@@ -201,15 +201,49 @@ save_template_title_series_sorting = 'library_order'

#: Set the list of words considered to be "articles" for sort strings
# Set the list of words that are to be considered 'articles' when computing the
-# title sort strings. The list is a regular expression, with the articles
-# separated by 'or' bars. Comparisons are case insensitive, and that cannot be
-# changed. Changes to this tweak won't have an effect until the book is modified
-# in some way. If you enter an invalid pattern, it is silently ignored.
-# To disable use the expression: '^$'
-# This expression is designed for articles that are followed by spaces. If you
-# also need to match articles that are followed by other characters, for example L'
-# in French, use: "^(A\s+|The\s+|An\s+|L')" instead.
-# Default: '^(A|The|An)\s+'
# title sort strings. The articles differ by language. By default, calibre uses
# a combination of articles from English and whatever language the calibre user
# interface is set to. In addition, in some contexts where the book language is
# available, the language of the book is used. You can change the list of
# articles for a given language or add a new language by editing
# per_language_title_sort_articles. To tell calibre to use a language other
# than the user interface language, set default_language_for_title_sort. For
# example, to use German, set it to 'deu'. A value of None means the user
# interface language is used. The setting title_sort_articles is ignored
# (present only for legacy reasons).
per_language_title_sort_articles = {
# English
'eng' : (r'A\s+', r'The\s+', r'An\s+'),
# Spanish
'spa' : (r'El\s+', r'La\s+', r'Lo\s+', r'Los\s+', r'Las\s+', r'Un\s+',
r'Una\s+', r'Unos\s+', r'Unas\s+'),
# French
'fra' : (r'Le\s+', r'La\s+', r"L'", r'Les\s+', r'Un\s+', r'Une\s+',
r'Des\s+'),
# Italian
'ita' : (r'Lo\s+', r'Il\s+', r"L'", r'La\s+', r'Gli\s+', r'I\s+',
r'Le\s+', ),
# Portuguese
'por' : (r'A\s+', r'O\s+', r'Os\s+', r'As\s+', r'Um\s+', r'Uns\s+',
r'Uma\s+', r'Umas\s+', ),
# Romanian
'ron' : (r'Un\s+', r'O\s+', r'Nişte\s+', ),
# German
'deu' : (r'Der\s+', r'Die\s+', r'Das\s+', r'Den\s+', r'Ein\s+',
r'Eine\s+', r'Einen\s+', ),
# Dutch
'nld' : (r'De\s+', r'Het\s+', r'Een\s+', ),
# Swedish
'swe' : (r'En\s+', r'Ett\s+', r'Det\s+', r'Den\s+', r'De\s+', ),
# Turkish
'tur' : (r'Bir\s+', ),
# Afrikaans
'afr' : (r"'n\s+", r'Die\s+', ),
# Greek
'ell' : (r'O\s+', r'I\s+', r'To\s+', r'Ta\s+', r'Tus\s+', r'Tis\s+',
r"'Enas\s+", r"'Mia\s+", r"'Ena\s+", r"'Enan\s+", ),
}
default_language_for_title_sort = None
title_sort_articles=r'^(A|The|An)\s+' title_sort_articles=r'^(A|The|An)\s+'
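
A minimal sketch (not calibre's internal code) of how entries like the ones above can be applied: the leading article is stripped case-insensitively before the sort string is built.

import re

articles = (r'A\s+', r'The\s+', r'An\s+')            # the 'eng' tuple above
strip_articles = re.compile(r'^(' + '|'.join(articles) + r')', re.IGNORECASE)

print(strip_articles.sub('', 'The Left Hand of Darkness'))      # Left Hand of Darkness
print(strip_articles.sub('', 'An Instance of the Fingerpost'))  # Instance of the Fingerpost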
#: Specify a folder calibre should connect to at startup #: Specify a folder calibre should connect to at startup

View File

@@ -567,7 +567,7 @@ from calibre.devices.nuut2.driver import NUUT2
from calibre.devices.iriver.driver import IRIVER_STORY
from calibre.devices.binatone.driver import README
from calibre.devices.hanvon.driver import (N516, EB511, ALEX, AZBOOKA, THEBOOK,
-    LIBREAIR)
    LIBREAIR, ODYSSEY)
from calibre.devices.edge.driver import EDGE
from calibre.devices.teclast.driver import (TECLAST_K3, NEWSMY, IPAPYRUS,
    SOVOS, PICO, SUNSTECH_EB700, ARCHOS7O, STASH, WEXLER)

@@ -689,7 +689,7 @@ plugins += [
    IPAPYRUS,
    EDGE,
    SNE,
-   ALEX,
    ALEX, ODYSSEY,
    PALMPRE,
    KOBO,
    AZBOOKA,

View File

@@ -757,6 +757,7 @@ output_profiles = [OutputProfile, SonyReaderOutput, SonyReader300Output,
        iPadOutput, KoboReaderOutput, TabletOutput, SamsungGalaxy,
        SonyReaderLandscapeOutput, KindleDXOutput, IlliadOutput,
        IRexDR1000Output, IRexDR800Output, JetBook5Output, NookOutput,
-       BambookOutput, NookColorOutput, PocketBook900Output, GenericEink, GenericEinkLarge]
        BambookOutput, NookColorOutput, PocketBook900Output, GenericEink,
        GenericEinkLarge, KindleFireOutput]

output_profiles.sort(cmp=lambda x,y:cmp(x.name.lower(), y.name.lower()))

View File

@ -166,12 +166,12 @@ class ANDROID(USBMS):
'MB525', 'ANDROID2.3', 'SGH-I997', 'GT-I5800_CARD', 'MB612', 'MB525', 'ANDROID2.3', 'SGH-I997', 'GT-I5800_CARD', 'MB612',
'GT-S5830_CARD', 'GT-S5570_CARD', 'MB870', 'MID7015A', 'GT-S5830_CARD', 'GT-S5570_CARD', 'MB870', 'MID7015A',
'ALPANDIGITAL', 'ANDROID_MID', 'VTAB1008', 'EMX51_BBG_ANDROI', 'ALPANDIGITAL', 'ANDROID_MID', 'VTAB1008', 'EMX51_BBG_ANDROI',
'UMS', '.K080', 'P990'] 'UMS', '.K080', 'P990', 'LTE']
WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897', WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD', 'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD',
'__UMS_COMPOSITE', 'SGH-I997_CARD', 'MB870', 'ALPANDIGITAL', '__UMS_COMPOSITE', 'SGH-I997_CARD', 'MB870', 'ALPANDIGITAL',
'ANDROID_MID', 'P990_SD_CARD', '.K080'] 'ANDROID_MID', 'P990_SD_CARD', '.K080', 'LTE_CARD']
OSX_MAIN_MEM = 'Android Device Main Memory' OSX_MAIN_MEM = 'Android Device Main Memory'

View File

@ -217,6 +217,7 @@ class ITUNES(DriverBase):
# 0x1297 iPhone 4 # 0x1297 iPhone 4
# 0x129a iPad # 0x129a iPad
# 0x129f iPad2 (WiFi) # 0x129f iPad2 (WiFi)
# 0x12a0 iPhone 4S
# 0x12a2 iPad2 (GSM) # 0x12a2 iPad2 (GSM)
# 0x12a3 iPad2 (CDMA) # 0x12a3 iPad2 (CDMA)
VENDOR_ID = [0x05ac] VENDOR_ID = [0x05ac]
@ -1305,6 +1306,8 @@ class ITUNES(DriverBase):
if DEBUG: if DEBUG:
self.log.info(" ITUNES._add_new_copy()") self.log.info(" ITUNES._add_new_copy()")
self._update_epub_metadata(fpath, metadata)
db_added = None db_added = None
lb_added = None lb_added = None
@ -1409,10 +1412,16 @@ class ITUNES(DriverBase):
tmp_cover.write(cover_data) tmp_cover.write(cover_data)
if lb_added: if lb_added:
if lb_added.Artwork.Count: try:
lb_added.Artwork.Item(1).SetArtworkFromFile(tc) if lb_added.Artwork.Count:
else: lb_added.Artwork.Item(1).SetArtworkFromFile(tc)
lb_added.AddArtworkFromFile(tc) else:
lb_added.AddArtworkFromFile(tc)
except:
if DEBUG:
self.log.warning(" iTunes automation interface reported an error"
" when adding artwork to '%s' in the iTunes Library" % metadata.title)
pass
if db_added: if db_added:
if db_added.Artwork.Count: if db_added.Artwork.Count:
@ -2638,68 +2647,61 @@ class ITUNES(DriverBase):
# Refresh epub metadata # Refresh epub metadata
with open(fpath,'r+b') as zfo: with open(fpath,'r+b') as zfo:
# Touch the OPF timestamp if False:
try: try:
zf_opf = ZipFile(fpath,'r') zf_opf = ZipFile(fpath,'r')
fnames = zf_opf.namelist() fnames = zf_opf.namelist()
opf = [x for x in fnames if '.opf' in x][0] opf = [x for x in fnames if '.opf' in x][0]
except: except:
raise UserFeedback("'%s' is not a valid EPUB" % metadata.title, raise UserFeedback("'%s' is not a valid EPUB" % metadata.title,
None, None,
level=UserFeedback.WARN) level=UserFeedback.WARN)
#Touch the OPF timestamp
opf_tree = etree.fromstring(zf_opf.read(opf))
md_els = opf_tree.xpath('.//*[local-name()="metadata"]')
if md_els:
ts = md_els[0].find('.//*[@name="calibre:timestamp"]')
if ts is not None:
timestamp = ts.get('content')
old_ts = parse_date(timestamp)
metadata.timestamp = datetime.datetime(old_ts.year, old_ts.month, old_ts.day, old_ts.hour,
old_ts.minute, old_ts.second, old_ts.microsecond+1, old_ts.tzinfo)
if DEBUG:
self.log.info(" existing timestamp: %s" % metadata.timestamp)
else:
metadata.timestamp = now()
if DEBUG:
self.log.info(" add timestamp: %s" % metadata.timestamp)
opf_tree = etree.fromstring(zf_opf.read(opf))
md_els = opf_tree.xpath('.//*[local-name()="metadata"]')
if md_els:
ts = md_els[0].find('.//*[@name="calibre:timestamp"]')
if ts is not None:
timestamp = ts.get('content')
old_ts = parse_date(timestamp)
metadata.timestamp = datetime.datetime(old_ts.year, old_ts.month, old_ts.day, old_ts.hour,
old_ts.minute, old_ts.second, old_ts.microsecond+1, old_ts.tzinfo)
if DEBUG:
self.log.info(" existing timestamp: %s" % metadata.timestamp)
else: else:
metadata.timestamp = now() metadata.timestamp = now()
if DEBUG: if DEBUG:
self.log.warning(" missing <metadata> block in OPF file")
self.log.info(" add timestamp: %s" % metadata.timestamp) self.log.info(" add timestamp: %s" % metadata.timestamp)
else:
metadata.timestamp = now()
if DEBUG:
self.log.warning(" missing <metadata> block in OPF file")
self.log.info(" add timestamp: %s" % metadata.timestamp)
# Force the language declaration for iBooks 1.1
#metadata.language = get_lang().replace('_', '-')
# Updates from metadata plugboard (ignoring publisher) zf_opf.close()
metadata.language = metadata_x.language
if DEBUG:
if metadata.language != metadata_x.language:
self.log.info(" rewriting language: <dc:language>%s</dc:language>" % metadata.language)
zf_opf.close()
# If 'News' in tags, tweak the title/author for friendlier display in iBooks # If 'News' in tags, tweak the title/author for friendlier display in iBooks
if _('News') in metadata.tags or \ if _('News') in metadata_x.tags or \
_('Catalog') in metadata.tags: _('Catalog') in metadata_x.tags:
if metadata.title.find('[') > 0: if metadata_x.title.find('[') > 0:
metadata.title = metadata.title[:metadata.title.find('[')-1] metadata_x.title = metadata_x.title[:metadata_x.title.find('[')-1]
date_as_author = '%s, %s %s, %s' % (strftime('%A'), strftime('%B'), strftime('%d').lstrip('0'), strftime('%Y')) date_as_author = '%s, %s %s, %s' % (strftime('%A'), strftime('%B'), strftime('%d').lstrip('0'), strftime('%Y'))
metadata.author = metadata.authors = [date_as_author] metadata_x.author = metadata_x.authors = [date_as_author]
sort_author = re.sub('^\s*A\s+|^\s*The\s+|^\s*An\s+', '', metadata.title).rstrip() sort_author = re.sub('^\s*A\s+|^\s*The\s+|^\s*An\s+', '', metadata_x.title).rstrip()
metadata.author_sort = '%s %s' % (sort_author, strftime('%Y-%m-%d')) metadata_x.author_sort = '%s %s' % (sort_author, strftime('%Y-%m-%d'))
# Remove any non-alpha category tags # Remove any non-alpha category tags
for tag in metadata.tags: for tag in metadata_x.tags:
if not self._is_alpha(tag[0]): if not self._is_alpha(tag[0]):
metadata.tags.remove(tag) metadata_x.tags.remove(tag)
# If windows & series, nuke tags so series used as Category during _update_iTunes_metadata() # If windows & series, nuke tags so series used as Category during _update_iTunes_metadata()
if iswindows and metadata.series: if iswindows and metadata_x.series:
metadata.tags = None metadata_x.tags = None
set_metadata(zfo, metadata, update_timestamp=True) set_metadata(zfo, metadata_x, apply_null=True, update_timestamp=True)
def _update_device(self, msg='', wait=True): def _update_device(self, msg='', wait=True):
''' '''
@ -2771,6 +2773,8 @@ class ITUNES(DriverBase):
lb_added.sort_name.set(metadata_x.title_sort) lb_added.sort_name.set(metadata_x.title_sort)
if db_added: if db_added:
self.log.warning(" waiting for db_added to become writeable ")
time.sleep(1.0)
db_added.name.set(metadata_x.title) db_added.name.set(metadata_x.title)
db_added.album.set(metadata_x.title) db_added.album.set(metadata_x.title)
db_added.artist.set(authors_to_string(metadata_x.authors)) db_added.artist.set(authors_to_string(metadata_x.authors))
@ -2826,6 +2830,8 @@ class ITUNES(DriverBase):
break break
if db_added: if db_added:
self.log.warning(" waiting for db_added to become writeable ")
time.sleep(1.0)
# If no title_sort plugboard tweak, create sort_name from series/index # If no title_sort plugboard tweak, create sort_name from series/index
if metadata.title_sort == metadata_x.title_sort: if metadata.title_sort == metadata_x.title_sort:
db_added.sort_name.set("%s %s" % (self.title_sorter(metadata_x.series), series_index)) db_added.sort_name.set("%s %s" % (self.title_sorter(metadata_x.series), series_index))
@ -2866,6 +2872,8 @@ class ITUNES(DriverBase):
lb_added.SortName = metadata_x.title_sort lb_added.SortName = metadata_x.title_sort
if db_added: if db_added:
self.log.warning(" waiting for db_added to become writeable ")
time.sleep(1.0)
db_added.Name = metadata_x.title db_added.Name = metadata_x.title
db_added.Album = metadata_x.title db_added.Album = metadata_x.title
db_added.Artist = authors_to_string(metadata_x.authors) db_added.Artist = authors_to_string(metadata_x.authors)

View File

@ -164,4 +164,21 @@ class EB511(USBMS):
OSX_MAIN_MEM_VOL_PAT = re.compile(r'/eReader') OSX_MAIN_MEM_VOL_PAT = re.compile(r'/eReader')
class ODYSSEY(N516):
name = 'Cybook Odyssey driver'
gui_name = 'Odyssey'
description = _('Communicate with the Cybook Odyssey eBook reader.')
BCD = [0x316]
VENDOR_NAME = 'LINUX'
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'FILE-STOR_GADGET'
FORMATS = ['epub', 'fb2', 'html', 'pdf', 'txt']
EBOOK_DIR_MAIN = 'calibre'
def get_main_ebook_dir(self, for_upload=False):
if for_upload:
return self.EBOOK_DIR_MAIN
return ''

View File

@ -95,18 +95,33 @@ def author_to_author_sort(author, method=None):
def authors_to_sort_string(authors): def authors_to_sort_string(authors):
return ' & '.join(map(author_to_author_sort, authors)) return ' & '.join(map(author_to_author_sort, authors))
try: _title_pats = {}
_title_pat = re.compile(tweaks.get('title_sort_articles', def get_title_sort_pat(lang=None):
r'^(A|The|An)\s+'), re.IGNORECASE) ans = _title_pats.get(lang, None)
except: if ans is not None:
print 'Error in title sort pattern' return ans
import traceback q = lang
traceback.print_exc() from calibre.utils.localization import canonicalize_lang, get_lang
_title_pat = re.compile('^(A|The|An)\s+', re.IGNORECASE) if lang is None:
q = tweaks['default_language_for_title_sort']
if q is None:
q = get_lang()
q = canonicalize_lang(q) if q else q
data = tweaks['per_language_title_sort_articles']
ans = data.get(q, None)
if ans is None:
ans = data['eng']
ans = frozenset(ans + data['eng'])
ans = '|'.join(ans)
ans = '^(%s)'%ans
ans = re.compile(ans, re.IGNORECASE)
_title_pats[lang] = ans
return ans
_ignore_starts = u'\'"'+u''.join(unichr(x) for x in range(0x2018, 0x201e)+[0x2032, 0x2033]) _ignore_starts = u'\'"'+u''.join(unichr(x) for x in
range(0x2018, 0x201e)+[0x2032, 0x2033])
def title_sort(title, order=None): def title_sort(title, order=None, lang=None):
if order is None: if order is None:
order = tweaks['title_series_sorting'] order = tweaks['title_series_sorting']
title = title.strip() title = title.strip()
@ -114,7 +129,7 @@ def title_sort(title, order=None):
return title return title
if title and title[0] in _ignore_starts: if title and title[0] in _ignore_starts:
title = title[1:] title = title[1:]
match = _title_pat.search(title) match = get_title_sort_pat(lang).search(title)
if match: if match:
try: try:
prep = match.group(1) prep = match.group(1)
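A usage sketch of the new language-aware calls, assuming a calibre source tree is importable and the default title_series_sorting tweak is in effect; the expected outputs follow the 'Exorcist, The' convention mentioned elsewhere in this changeset.

    from calibre.ebooks.metadata import get_title_sort_pat, title_sort

    # Patterns are compiled once per language and cached; passing no lang falls
    # back to the default_language_for_title_sort tweak, then to the interface
    # language.
    pat = get_title_sort_pat('fra')
    print(bool(pat.match("L'Etranger")))              # True: L' is a French article

    print(title_sort('The Exorcist', lang='eng'))     # expected: Exorcist, The
    print(title_sort('Le Petit Prince', lang='fra'))  # expected: Petit Prince, Le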

View File

@ -15,7 +15,7 @@ from calibre.customize import Plugin
from calibre.utils.logging import ThreadSafeLog, FileStream from calibre.utils.logging import ThreadSafeLog, FileStream
from calibre.utils.config import JSONConfig from calibre.utils.config import JSONConfig
from calibre.utils.titlecase import titlecase from calibre.utils.titlecase import titlecase
from calibre.utils.icu import capitalize, lower from calibre.utils.icu import capitalize, lower, upper
from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata import check_isbn
msprefs = JSONConfig('metadata_sources/global.json') msprefs = JSONConfig('metadata_sources/global.json')
@ -121,7 +121,12 @@ def cap_author_token(token):
# Normalize tokens of the form J.K. to J. K. # Normalize tokens of the form J.K. to J. K.
parts = token.split('.') parts = token.split('.')
return '. '.join(map(capitalize, parts)).strip() return '. '.join(map(capitalize, parts)).strip()
return capitalize(token) ans = capitalize(token)
for x in ('-', "'"):
idx = ans.find(x)
if idx > -1 and len(ans) > idx+2:
ans = ans[:idx+1] + upper(ans[idx+1]) + ans[idx+2:]
return ans
def fixauthors(authors): def fixauthors(authors):
if not authors: if not authors:

View File

@ -11,7 +11,7 @@ import datetime
from urllib import quote_plus from urllib import quote_plus
from Queue import Queue, Empty from Queue import Queue, Empty
from lxml import etree, html from lxml import etree, html
from calibre import prints, as_unicode from calibre import as_unicode
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
@ -54,7 +54,7 @@ class Ozon(Source):
def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
# div_book -> search only books, ebooks and audio books # div_book -> search only books, ebooks and audio books
search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText=' search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='
# for ozon.ru search we have to format ISBN with '-' # for ozon.ru search we have to format ISBN with '-'
isbn = _format_isbn(log, identifiers.get('isbn', None)) isbn = _format_isbn(log, identifiers.get('isbn', None))
# TODO: format isbn! # TODO: format isbn!
@ -79,7 +79,7 @@ class Ozon(Source):
return search_url return search_url
# }}} # }}}
def identify(self, log, result_queue, abort, title=None, authors=None, def identify(self, log, result_queue, abort, title=None, authors=None,
identifiers={}, timeout=30): # {{{ identifiers={}, timeout=30): # {{{
if not self.is_configured(): if not self.is_configured():
return return
@ -112,13 +112,13 @@ class Ozon(Source):
def get_metadata(self, log, entries, title, authors, identifiers): # {{{ def get_metadata(self, log, entries, title, authors, identifiers): # {{{
# some book titles have extra characters like this # some book titles have extra characters like this
# TODO: make a tweak # TODO: make a tweak
reRemoveFromTitle = None reRemoveFromTitle = None
#reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]') #reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
title = unicode(title).upper() if title else '' title = unicode(title).upper() if title else ''
if reRemoveFromTitle: if reRemoveFromTitle:
title = reRemoveFromTitle.sub('', title) title = reRemoveFromTitle.sub('', title)
authors = map(_normalizeAuthorNameWithInitials, authors = map(_normalizeAuthorNameWithInitials,
map(unicode.upper, map(unicode, authors))) if authors else None map(unicode.upper, map(unicode, authors))) if authors else None
ozon_id = identifiers.get('ozon', None) ozon_id = identifiers.get('ozon', None)
@ -320,7 +320,7 @@ class Ozon(Source):
displ_lang = lng_splt[0].strip() displ_lang = lng_splt[0].strip()
metadata.language = _translageLanguageToCode(displ_lang) metadata.language = _translageLanguageToCode(displ_lang)
#log.debug(u'language: %s'%displ_lang) #log.debug(u'language: %s'%displ_lang)
# may already be set from the xml search response # may already be set from the xml search response
if not metadata.pubdate: if not metadata.pubdate:
xpt = u'normalize-space(//div[@class="product-misc"]//text()[contains(., "г.")])' xpt = u'normalize-space(//div[@class="product-misc"]//text()[contains(., "г.")])'
@ -434,13 +434,13 @@ def _translageLanguageToCode(displayLang): # {{{
# [В.П. Колесников | Колесников В.П.] -> В. П. Колесников # [В.П. Колесников | Колесников В.П.] -> В. П. Колесников
def _normalizeAuthorNameWithInitials(name): # {{{ def _normalizeAuthorNameWithInitials(name): # {{{
res = name res = name
if name: if name:
re1 = u'^(?P<lname>\S+)\s+(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?$' re1 = u'^(?P<lname>\S+)\s+(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?$'
re2 = u'^(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?\s+(?P<lname>\S+)$' re2 = u'^(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?\s+(?P<lname>\S+)$'
matcher = re.match(re1, unicode(name), re.UNICODE) matcher = re.match(re1, unicode(name), re.UNICODE)
if not matcher: if not matcher:
matcher = re.match(re2, unicode(name), re.UNICODE) matcher = re.match(re2, unicode(name), re.UNICODE)
if matcher: if matcher:
d = matcher.groupdict() d = matcher.groupdict()
res = ' '.join(x for x in (d['fname'], d['mname'], d['lname']) if x) res = ' '.join(x for x in (d['fname'], d['mname'], d['lname']) if x)

View File

@ -302,7 +302,18 @@ class MobiWriter(object):
def generate_record0(self): # MOBI header {{{ def generate_record0(self): # MOBI header {{{
metadata = self.oeb.metadata metadata = self.oeb.metadata
exth = self.build_exth() bt = 0x002
if self.primary_index_record_idx is not None:
if False and self.indexer.is_flat_periodical:
# Disabled as setting this to 0x102 causes the Kindle to not
# auto archive the issues
bt = 0x102
elif self.indexer.is_periodical:
# If you change this, remember to change the cdetype in the EXTH
# header as well
bt = 0x103 if self.indexer.is_flat_periodical else 0x101
exth = self.build_exth(bt)
first_image_record = None first_image_record = None
if self.image_records: if self.image_records:
first_image_record = len(self.records) first_image_record = len(self.records)
@ -351,17 +362,6 @@ class MobiWriter(object):
# 0x10 - 0x13 : UID # 0x10 - 0x13 : UID
# 0x14 - 0x17 : Generator version # 0x14 - 0x17 : Generator version
bt = 0x002
if self.primary_index_record_idx is not None:
if False and self.indexer.is_flat_periodical:
# Disabled as setting this to 0x102 causes the Kindle to not
# auto archive the issues
bt = 0x102
elif self.indexer.is_periodical:
# If you change this, remember to change the cdetype in the EXTH
# header as well
bt = {'newspaper':0x101}.get(self.publication_type, 0x103)
record0.write(pack(b'>IIIII', record0.write(pack(b'>IIIII',
0xe8, bt, 65001, uid, 6)) 0xe8, bt, 65001, uid, 6))
@ -479,7 +479,7 @@ class MobiWriter(object):
self.records[0] = align_block(record0) self.records[0] = align_block(record0)
# }}} # }}}
def build_exth(self): # EXTH Header {{{ def build_exth(self, mobi_doctype): # EXTH Header {{{
oeb = self.oeb oeb = self.oeb
exth = StringIO() exth = StringIO()
nrecs = 0 nrecs = 0
@ -535,16 +535,17 @@ class MobiWriter(object):
nrecs += 1 nrecs += 1
# Write cdetype # Write cdetype
if not self.is_periodical and not self.opts.share_not_sync: if not self.is_periodical:
exth.write(pack(b'>II', 501, 12)) if not self.opts.share_not_sync:
exth.write(b'EBOK') exth.write(pack(b'>II', 501, 12))
nrecs += 1 exth.write(b'EBOK')
nrecs += 1
else: else:
# Should be b'NWPR' for doc type of 0x101 and b'MAGZ' for doctype ids = {0x101:b'NWPR', 0x103:b'MAGZ'}.get(mobi_doctype, None)
# of 0x103 but the old writer didn't write them, and I dont know if ids:
# what it should be for type 0x102 (b'BLOG'?) so write nothing exth.write(pack(b'>II', 501, 12))
# instead exth.write(ids)
pass nrecs += 1
# Add a publication date entry # Add a publication date entry
if oeb.metadata['date']: if oeb.metadata['date']:
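For reference, a standalone sketch of the EXTH 501 (cdetype) record written by the branch above: a 4-byte record type, a 4-byte record length (always 12 here), then the 4-byte cdetype string. The doctype-to-cdetype mapping is the one in the diff (0x101 -> NWPR, 0x103 -> MAGZ, plain books -> EBOK); the helper name is invented for the example.

    import struct

    def cdetype_record(mobi_doctype, is_periodical):
        # Non-periodicals are tagged EBOK; periodicals get NWPR or MAGZ based
        # on the record0 book type, and anything else writes no record at all.
        if not is_periodical:
            payload = b'EBOK'
        else:
            payload = {0x101: b'NWPR', 0x103: b'MAGZ'}.get(mobi_doctype)
            if payload is None:
                return b''
        return struct.pack(b'>II', 501, 12) + payload

    print(repr(cdetype_record(0x002, False)))  # EBOK record for a normal book
    print(repr(cdetype_record(0x103, True)))   # MAGZ record for a flat periodical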

View File

@ -965,16 +965,22 @@ class Manifest(object):
try: try:
data = etree.fromstring(data, parser=parser) data = etree.fromstring(data, parser=parser)
except etree.XMLSyntaxError: except etree.XMLSyntaxError:
self.oeb.logger.warn('Stripping comments and meta tags from %s'% self.oeb.logger.warn('Stripping comments from %s'%
self.href) self.href)
data = re.compile(r'<!--.*?-->', re.DOTALL).sub('', data = re.compile(r'<!--.*?-->', re.DOTALL).sub('',
data) data)
data = re.sub(r'<meta\s+[^>]+?>', '', data)
data = data.replace( data = data.replace(
"<?xml version='1.0' encoding='utf-8'?><o:p></o:p>", "<?xml version='1.0' encoding='utf-8'?><o:p></o:p>",
'') '')
data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '') data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
data = etree.fromstring(data, parser=RECOVER_PARSER) try:
data = etree.fromstring(data,
parser=RECOVER_PARSER)
except etree.XMLSyntaxError:
self.oeb.logger.warn('Stripping meta tags from %s'%
self.href)
data = re.sub(r'<meta\s+[^>]+?>', '', data)
data = etree.fromstring(data, parser=RECOVER_PARSER)
elif namespace(data.tag) != XHTML_NS: elif namespace(data.tag) != XHTML_NS:
# OEB_DOC_NS, but possibly others # OEB_DOC_NS, but possibly others
ns = namespace(data.tag) ns = namespace(data.tag)
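The staged recovery above (strip comments first, and only strip <meta> tags if the recovering parse still fails, so valid meta tags survive whenever possible) can be illustrated with a standalone sketch; the sample markup and helper name are made up.

    import re
    from lxml import etree

    RECOVER_PARSER = etree.XMLParser(recover=True)

    def parse_with_fallbacks(data):
        try:
            return etree.fromstring(data)          # strict parse
        except etree.XMLSyntaxError:
            data = re.compile(r'<!--.*?-->', re.DOTALL).sub('', data)
        try:
            return etree.fromstring(data, parser=RECOVER_PARSER)
        except etree.XMLSyntaxError:
            data = re.compile(r'<meta\s+[^>]+?>').sub('', data)
            return etree.fromstring(data, parser=RECOVER_PARSER)

    broken = '<html><head><meta charset=utf-8></head><body><p>hi</p></body></html>'
    print(etree.tostring(parse_with_fallbacks(broken)))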

View File

@ -11,9 +11,8 @@ import operator
import math import math
from collections import defaultdict from collections import defaultdict
from lxml import etree from lxml import etree
from calibre.ebooks.oeb.base import XHTML, XHTML_NS from calibre.ebooks.oeb.base import (XHTML, XHTML_NS, CSS_MIME, OEB_STYLES,
from calibre.ebooks.oeb.base import CSS_MIME, OEB_STYLES namespace, barename, XPath)
from calibre.ebooks.oeb.base import namespace, barename
from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.stylizer import Stylizer
COLLAPSE = re.compile(r'[ \t\r\n\v]+') COLLAPSE = re.compile(r'[ \t\r\n\v]+')
@ -232,7 +231,10 @@ class CSSFlattener(object):
cssdict['text-align'] = val cssdict['text-align'] = val
del node.attrib['align'] del node.attrib['align']
if node.tag == XHTML('font'): if node.tag == XHTML('font'):
node.tag = XHTML('span') tags = ['descendant::h:%s'%x for x in ('p', 'div', 'table', 'h1',
'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul', 'dl', 'blockquote')]
tag = 'div' if XPath('|'.join(tags))(node) else 'span'
node.tag = XHTML(tag)
if 'size' in node.attrib: if 'size' in node.attrib:
def force_int(raw): def force_int(raw):
return int(re.search(r'([0-9+-]+)', raw).group(1)) return int(re.search(r'([0-9+-]+)', raw).group(1))
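A standalone sketch of the decision added above: a <font> element that contains block-level descendants is converted to a <div> (a <span> must not wrap a <p>), otherwise to a <span>. Namespaces and calibre's XPath helper are omitted for brevity.

    from lxml import etree

    BLOCK_TAGS = ('p', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                  'ol', 'ul', 'dl', 'blockquote')

    def new_tag_for_font(font_el):
        # Same descendant test as above, minus the XHTML namespace prefix.
        has_block = font_el.xpath('|'.join('descendant::%s' % t for t in BLOCK_TAGS))
        return 'div' if has_block else 'span'

    root = etree.fromstring('<body><font size="4"><p>text</p></font>'
                            '<font color="red">inline</font></body>')
    for el in list(root.iter('font')):
        el.tag = new_tag_for_font(el)
    print(etree.tostring(root))  # first font becomes a div, the second a span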

View File

@ -17,7 +17,7 @@ from calibre.ebooks.metadata import fmt_sidx, authors_to_string, string_to_autho
from calibre.ebooks.metadata.book.base import SafeFormat from calibre.ebooks.metadata.book.base import SafeFormat
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.config import tweaks, prefs from calibre.utils.config import tweaks, prefs
from calibre.utils.date import dt_factory, qt_to_dt from calibre.utils.date import dt_factory, qt_to_dt, as_local_time
from calibre.utils.icu import sort_key from calibre.utils.icu import sort_key
from calibre.utils.search_query_parser import SearchQueryParser from calibre.utils.search_query_parser import SearchQueryParser
from calibre.library.caches import (_match, CONTAINS_MATCH, EQUALS_MATCH, from calibre.library.caches import (_match, CONTAINS_MATCH, EQUALS_MATCH,
@ -580,7 +580,7 @@ class BooksModel(QAbstractTableModel): # {{{
def datetime_type(r, idx=-1): def datetime_type(r, idx=-1):
val = self.db.data[r][idx] val = self.db.data[r][idx]
if val is not None: if val is not None:
return QVariant(QDateTime(val)) return QVariant(QDateTime(as_local_time(val)))
else: else:
return QVariant(UNDEFINED_QDATETIME) return QVariant(UNDEFINED_QDATETIME)

View File

@ -138,9 +138,10 @@ class TitleSortEdit(TitleEdit):
' For example, The Exorcist might be sorted as Exorcist, The.') ' For example, The Exorcist might be sorted as Exorcist, The.')
LABEL = _('Title &sort:') LABEL = _('Title &sort:')
def __init__(self, parent, title_edit, autogen_button): def __init__(self, parent, title_edit, autogen_button, languages_edit):
TitleEdit.__init__(self, parent) TitleEdit.__init__(self, parent)
self.title_edit = title_edit self.title_edit = title_edit
self.languages_edit = languages_edit
base = self.TOOLTIP base = self.TOOLTIP
ok_tooltip = '<p>' + textwrap.fill(base+'<br><br>'+ ok_tooltip = '<p>' + textwrap.fill(base+'<br><br>'+
@ -157,10 +158,20 @@ class TitleSortEdit(TitleEdit):
self.autogen_button = autogen_button self.autogen_button = autogen_button
autogen_button.clicked.connect(self.auto_generate) autogen_button.clicked.connect(self.auto_generate)
languages_edit.editTextChanged.connect(self.update_state)
languages_edit.currentIndexChanged.connect(self.update_state)
self.update_state() self.update_state()
@property
def book_lang(self):
try:
book_lang = self.languages_edit.lang_codes[0]
except:
book_lang = None
return book_lang
def update_state(self, *args): def update_state(self, *args):
ts = title_sort(self.title_edit.current_val) ts = title_sort(self.title_edit.current_val, lang=self.book_lang)
normal = ts == self.current_val normal = ts == self.current_val
if normal: if normal:
col = 'rgb(0, 255, 0, 20%)' col = 'rgb(0, 255, 0, 20%)'
@ -173,7 +184,8 @@ class TitleSortEdit(TitleEdit):
self.setWhatsThis(tt) self.setWhatsThis(tt)
def auto_generate(self, *args): def auto_generate(self, *args):
self.current_val = title_sort(self.title_edit.current_val) self.current_val = title_sort(self.title_edit.current_val,
lang=self.book_lang)
def break_cycles(self): def break_cycles(self):
try: try:

View File

@ -109,6 +109,9 @@ class MetadataSingleDialogBase(ResizableDialog):
def create_basic_metadata_widgets(self): # {{{ def create_basic_metadata_widgets(self): # {{{
self.basic_metadata_widgets = [] self.basic_metadata_widgets = []
self.languages = LanguagesEdit(self)
self.basic_metadata_widgets.append(self.languages)
self.title = TitleEdit(self) self.title = TitleEdit(self)
self.title.textChanged.connect(self.update_window_title) self.title.textChanged.connect(self.update_window_title)
self.deduce_title_sort_button = QToolButton(self) self.deduce_title_sort_button = QToolButton(self)
@ -119,7 +122,7 @@ class MetadataSingleDialogBase(ResizableDialog):
self.deduce_title_sort_button.setWhatsThis( self.deduce_title_sort_button.setWhatsThis(
self.deduce_title_sort_button.toolTip()) self.deduce_title_sort_button.toolTip())
self.title_sort = TitleSortEdit(self, self.title, self.title_sort = TitleSortEdit(self, self.title,
self.deduce_title_sort_button) self.deduce_title_sort_button, self.languages)
self.basic_metadata_widgets.extend([self.title, self.title_sort]) self.basic_metadata_widgets.extend([self.title, self.title_sort])
self.deduce_author_sort_button = b = QToolButton(self) self.deduce_author_sort_button = b = QToolButton(self)
@ -203,9 +206,6 @@ class MetadataSingleDialogBase(ResizableDialog):
self.publisher = PublisherEdit(self) self.publisher = PublisherEdit(self)
self.basic_metadata_widgets.append(self.publisher) self.basic_metadata_widgets.append(self.publisher)
self.languages = LanguagesEdit(self)
self.basic_metadata_widgets.append(self.languages)
self.timestamp = DateEdit(self) self.timestamp = DateEdit(self)
self.pubdate = PubdateEdit(self) self.pubdate = PubdateEdit(self)
self.basic_metadata_widgets.extend([self.timestamp, self.pubdate]) self.basic_metadata_widgets.extend([self.timestamp, self.pubdate])
@ -282,7 +282,6 @@ class MetadataSingleDialogBase(ResizableDialog):
# Commented out as it doesn't play nice with Next, Prev buttons # Commented out as it doesn't play nice with Next, Prev buttons
#self.fetch_metadata_button.setFocus(Qt.OtherFocusReason) #self.fetch_metadata_button.setFocus(Qt.OtherFocusReason)
# Miscellaneous interaction methods {{{ # Miscellaneous interaction methods {{{
def update_window_title(self, *args): def update_window_title(self, *args):
title = self.title.current_val title = self.title.current_val

View File

@ -209,6 +209,12 @@ class CybookOrizon(CybookOpus):
name = 'Cybook Orizon' name = 'Cybook Orizon'
id = 'cybook_orizon' id = 'cybook_orizon'
class CybookOdyssey(CybookOpus):
name = 'Cybook Odyssey'
id = 'cybook_odyssey'
class PocketBook360(CybookOpus): class PocketBook360(CybookOpus):
manufacturer = 'PocketBook' manufacturer = 'PocketBook'

View File

@ -332,8 +332,8 @@ def do_remove(db, ids):
for y in x: for y in x:
db.delete_book(y) db.delete_book(y)
send_message()
db.clean() db.clean()
send_message()
def remove_option_parser(): def remove_option_parser():
return get_parser(_( return get_parser(_(
@ -358,7 +358,7 @@ def command_remove(args, dbpath):
for x in args[1].split(','): for x in args[1].split(','):
y = x.split('-') y = x.split('-')
if len(y) > 1: if len(y) > 1:
ids.append(range(int(y[0], int(y[1])))) ids.extend(range(int(y[0]), int(y[1])))
else: else:
ids.append(int(y[0])) ids.append(int(y[0]))
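The old line passed the second number as the base argument of int() because of a misplaced parenthesis, and appended the whole range as a single list element; the fix corrects the parenthesis and uses extend. A standalone sketch of the corrected id-range parsing (helper name invented for the example):

    def parse_ids(arg):
        # '12-15,23' expands to the individual ids 12, 13, 14 and 23; the upper
        # bound of a range is exclusive, as in the code above.
        ids = []
        for x in arg.split(','):
            y = x.split('-')
            if len(y) > 1:
                ids.extend(range(int(y[0]), int(y[1])))
            else:
                ids.append(int(y[0]))
        return ids

    print(parse_ids('12-15,23'))  # [12, 13, 14, 23]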

View File

@ -15,7 +15,7 @@ from math import ceil
from calibre import prints from calibre import prints
from calibre.ebooks.metadata import (title_sort, author_to_author_sort, from calibre.ebooks.metadata import (title_sort, author_to_author_sort,
string_to_authors, authors_to_string) string_to_authors, authors_to_string, get_title_sort_pat)
from calibre.ebooks.metadata.opf2 import metadata_to_opf from calibre.ebooks.metadata.opf2 import metadata_to_opf
from calibre.library.database import LibraryDatabase from calibre.library.database import LibraryDatabase
from calibre.library.field_metadata import FieldMetadata, TagsIcons from calibre.library.field_metadata import FieldMetadata, TagsIcons
@ -1004,10 +1004,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
return False return False
def find_identical_books(self, mi): def find_identical_books(self, mi):
fuzzy_title_patterns = [(re.compile(pat, re.IGNORECASE), repl) for pat, repl in fuzzy_title_patterns = [(re.compile(pat, re.IGNORECASE) if
isinstance(pat, basestring) else pat, repl) for pat, repl in
[ [
(r'[\[\](){}<>\'";,:#]', ''), (r'[\[\](){}<>\'";,:#]', ''),
(tweaks.get('title_sort_articles', r'^(a|the|an)\s+'), ''), (get_title_sort_pat(), ''),
(r'[-._]', ' '), (r'[-._]', ' '),
(r'\s+', ' ') (r'\s+', ' ')
] ]
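A standalone sketch of the fuzzy title normalisation above, with a fixed English article pattern standing in for get_title_sort_pat() so it runs outside calibre; the helper names are invented for the example.

    import re

    article_pat = re.compile(r'^(A\s+|The\s+|An\s+)', re.IGNORECASE)
    extra_patterns = [(re.compile(pat, re.IGNORECASE), repl) for pat, repl in
        [
            (r'[\[\](){}<>\'";,:#]', ''),
            (r'[-._]', ' '),
            (r'\s+', ' '),
        ]]

    def fuzzy_title(title):
        # Strip a leading article, punctuation and extra whitespace so that
        # near-identical titles collapse to the same comparison key.
        title = article_pat.sub('', title.strip().lower())
        for pat, repl in extra_patterns:
            title = pat.sub(repl, title)
        return title.strip()

    print(fuzzy_title('The Time-Machine'))  # time machine
    print(fuzzy_title('time machine'))      # time machine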

View File

@ -117,8 +117,8 @@ How can I help get my device supported in |app|?
If your device appears as a USB disk to the operating system, adding support for it to |app| is very easy. If your device appears as a USB disk to the operating system, adding support for it to |app| is very easy.
We just need some information from you: We just need some information from you:
* What ebook formats does your device support? * Complete list of ebook formats that your device supports.
* Is there a special directory on the device in which all ebook files should be placed? * Is there a special directory on the device in which all ebook files should be placed? Also, does the device detect files placed in sub-directories?
* We also need information about your device that |app| will collect automatically. First, if your * We also need information about your device that |app| will collect automatically. First, if your
device supports SD cards, insert them. Then connect your device to the computer. In calibre go to :guilabel:`Preferences->Advanced->Miscellaneous` device supports SD cards, insert them. Then connect your device to the computer. In calibre go to :guilabel:`Preferences->Advanced->Miscellaneous`
and click the "Debug device detection" button. This will create some debug output. Copy it to a file and click the "Debug device detection" button. This will create some debug output. Copy it to a file

File diff suppressed because it is too large

View File

@ -339,6 +339,7 @@ class TemplateFormatter(string.Formatter):
########## a formatter that throws exceptions ############ ########## a formatter that throws exceptions ############
def unsafe_format(self, fmt, kwargs, book): def unsafe_format(self, fmt, kwargs, book):
self.column_name = self.template_cache = None
self.kwargs = kwargs self.kwargs = kwargs
self.book = book self.book = book
self.composite_values = {} self.composite_values = {}