diff --git a/Changelog.yaml b/Changelog.yaml index a0c41bac4b..21b92493a7 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -19,6 +19,433 @@ # new recipes: # - title: +- version: 0.8.28 + date: 2011-11-25 + + new features: + - title: "Get Books: Add litres.ru store" + + - title: "Change the algorithm that generates title sort strings to strip leading articles from both english and the current language set for the calibre user interface. In addition, in the edit metadata dialog, calibre will use the book's language when calculating the sort string. This behavior can be adjusted via Preferences->Tweaks." + tickets: [886763] + + - title: "Driver for Cybook Odyssey." + tickets: [893457] + + - title: "Irex driver: Put books into the top level directory instead of into /ebooks or /Books." + tickets: [883616] + + bug fixes: + - title: "Have downloaded periodicals recognized when transferred via USB to the Kindle Fire" + + - title: "MOBI Output: Fix underline and strikethrough properties declared on parents not being rendered on child tags." + tickets: [894245] + + - title: "Template language: Fix regression that broke ordering of items when formatting a list" + + - title: "Conversion pipeline: When removing obsolete <font> tags convert them to <div>
instead of <span> if they contain block level tags." + tickets: [892525] + + - title: "When downloading metadata, fix the case normalization of double-barrelled author names." + tickets: [893257] + + - title: "Template language: Fix regression that broke using general program mode in save to disk templates" + + - title: "calibredb: Fix use of ranges when specifying ids for the remove command" + + - title: "Apple driver: Add ids for iPhone 4S. More robust against iTunes automation errors when adding artwork." + tickets: [892468] + + - title: "Fix encoding of comments incorrectly detected when downloading metadata from ozon.ru" + + - title: "Fix calibre not getting list of books on the Kindle Fire" + + improved recipes: + - El Mundo + - BBC + - NIN Online + - ABC Australia + - Salon.com + - Expansion (Spanish) + - The Week + - Heise Online + + new recipes: + - title: Give me something to read and Let's get Critical + author: Barty + + - title: Worldcrunch + author: Krittika Goyal + +- version: 0.8.27 + date: 2011-11-18 + + new features: + - title: "Drivers for the Kindle Fire and the Nook Tablet" + tickets: [890918] + + - title: "Conversion: Add an option under Look & Feel to remove specified style information (CSS) from the document during conversion." + tickets: [871384] + + - title: "Add an option in the bulk metadata edit dialog to restore the pre-conversion files for many books with a single click." + tickets: [886116] + + - title: "Jobs list: Add the ability to search for and to hide jobs, useful if you have run a lot of jobs and the list is getting crowded." + tickets: [883734] + + - title: "Book jacket generation: Add ability to customize the book jacket template and add custom columns into the jacket." + tickets: [889912] + + - title: "MOBI Input: Performance improvement when viewing/converting a file with a lot of links" + + bug fixes: + - title: "Fix regression in 0.8.26 that broke disabling the update of particular fields during a bulk metadata download." 
+ tickets: [889696] + + - title: "Get Books: Fix DRM status for legimi" + + - title: "When parsing for lxml via BeautifulSoup, use the calibre modified copy of BeautifulSoup (more robust)." + tickets: [889890] + + - title: "HTML Input: Handle double encoded URLs in img tags" + tickets: [889323] + + improved recipes: + - Various Polish recipes + - Academia Catavencu + - El Periodico de Aragon + - Weblogs SL + - Folha de Sao Paolo (subscription) + + new recipes: + - title: News on Japan + author: Krittika Goyal + + - title: Formula AS + author: Silviu Cotoara + + - title: Various Turkish news sources + author: Osman Kaysan + + - title: Infra.pl and Spider's Web + author: fenuks + + +- version: 0.8.26 + date: 2011-11-12 + + new features: + - title: "Tweak to control sorting of date type columns. You can choose to have them sorted only by displayed fields" + + - title: "Driver for the Trekstor 3.0" + + - title: "Performance improvements when evaluating templates, and in particular general program mode templates" + + bug fixes: + - title: "ODT Input: When converting to EPUB improve handling of large images placed inside small frames, to prevent them from obscuring text." + tickets: [860272,884759] + + - title: "EPUB Input: Automatically strip entries of type application/text from the spine. Apparently there are EPUB production tools out there that create them." + tickets: [884792] + + - title: "Keep the startup splash screen visible until the GUI has fully completed initializing." + tickets: [885827] + + - title: "ODT Input: Fix handling of span tags containing only whitespace." + tickets: [887311] + + - title: "On windows when changing title or author via the main book list, handle the case of one of the books files being open in another program more gracefully." + tickets: [880585] + + - title: "When adding a format to an existing book record, ensure that no changes are made to the database until after the file operations have succeeded." 
+ + - title: "Fix bug that prevented configuring which fields to download metadata for when adding books by ISBN" + tickets: [856076] + + - title: "Fix Japanese characters not being correctly displayed on index pages in news downloads for the SONY T1" + tickets: [888029] + + - title: "Get Books: Fix booleans in search expressions not working in non-English calibre versions" + tickets: [887554] + + - title: "Fix a bug in the support for hours/minutes/seconds in datetime format strings" + tickets: [887412] + + - title: "Treat an author_sort value of 'Unknown' the same way as unknown authors are treated in template processing" + + - title: "Detect SD card in Kobo Vox" + + - title: "Amazon metadata download: Workaround for change in Amazon website causing some books to have incorrect ratings downloaded" + + improved recipes: + - Metro NL + - The Independent + - Frankfurter Rundschau + - L'Espresso + - Il Giornale + - Berlingske.dk + - Suedeutsche Zeitung + + new recipes: + - title: Techtarget + author: Julio Map + +- version: 0.8.25 + date: 2011-11-06 + + new features: + - title: "Drivers for the LG Optimus 2X, HTC Incredible S, Samsung Stratosphere and the Kobo Vox" + tickets: [886558, 885058, 884762, 884039] + + - title: "Get books: Add ebookpoint.pl store" + + - title: "Support hour/minute/seconds in datetime format strings in the template language and in tweaks" + + bug fixes: + - title: "Fix Book details preferences showing custom columns even after they have been deleted" + tickets: [884799] + + - title: "Replace use of insecure tempfile in the bundled rtf2xml library." + tickets: [885245] + + - title: "Remove the suid mount helper used on linux and bsd, as it proved impossible to make it secure." + description: "This means that if you are on BSD or an older linux distribution, without support for udisks, device detection will no longer work in calibre. 
You will have to either mount the devices by hand before starting calibre, or stick with version 0.8.24 (the vulnerability in the mount helper is a privilege escalation, which is relatively harmless on the vast majority of single user systems)." + tickets: [885027] + + - title: "Do not error out if there is an invalid regex for title sort set in tweaks" + + - title: "Content server: Fix another place where --url-prefix was forgotten" + tickets: [885332] + + - title: "HTML Input: Limit memory consumption when converting HTML files that link to large binary files." + tickets: [884821] + + - title: "T1 driver: Workaround for T1 showing error messages when opening some news downloads on the device" + + - title: "Kobo driver: Fix longstanding bug that would prevent re-adding a epub that has been previously deleted from the Kobo using calibre" + + - title: "Fix partial cover search not resuming after pressing back in the metadata download dialog" + tickets: [875196] + + - title: "T1 driver: Fix auto refresh covers option" + + - title: "Content server: Do not show tracebacks in HTML output when not running in develop mode" + + - title: "Textile output; Fix out of memory issue when dealing with large margins." + + improved recipes: + - The Independent + - Die Zeit subscription version + - NIN online + - Science News + - Updated Daily Mirror + - Science AAAS + + new recipes: + - title: b365 Realitatea and Catavencii + author: Silviu Cotoara + + - title: Various Greek news sources + author: Stelios + + - title: Real world economics blog + author: Julio Map + +- version: 0.8.24 + date: 2011-10-27 + + new features: + - title: "Kobo: Add support for fetching annotations from the kobo reader." + description: "Right click the send to device button in calibre with your kobo connected and choose fetch annotations. The annotations are placed into the comments of the corresponding books in the calibre library. This feature is still experimental." 
+ type: major + + - title: "Preserve the set of selected books in the library view when a device is connected, fixing a long standing annoyance" + + bug fixes: + - title: "Prevent changing of device metadata management option while a device is connected." + tickets: [874118] + + - title: "Book details panel: Show tooltip only when hovering over cover, not the rest of the book information, as it makes it hard to read." + tickets: [876454] + + - title: "MOBI Output: Fix use of list elements as link anchors caused links to always point to start of list." + tickets: [879391] + + - title: "RB Output: Fix calibre generated rb files not being opened by the RocketBook." + tickets: [880930] + + - title: "FB2 Input: Dont choke on FB2 files that have empty embedded content tags." + tickets: [880904] + + - title: "ODT Input: CSS rationalization should not fail with non ascii class names" + + - title: "Fix creating new library using the copy structure option incorrectly setting all text type columns to be like the tags column" + + - title: "E-book viewer: Don't choke on windows installs with a non UTF-8 filesystem encoding." + tickets: [879740] + + + improved recipes: + - Novaya Gazeta + - El Universal (Venezuela) + - The Australian (subscription enabled) + - Metro NL + - The Scotsman + - Japan Times + + new recipes: + - title: Silicon Republic + author: Neil Grogan + + - title: Calibre Blog + author: Krittika Goyal + +- version: 0.8.23 + date: 2011-10-21 + + new features: + - title: "Drivers for T-Mobile Move, new Pandigital Novel, New Onyx Boox and Freescale MX 515" + + - title: "SONY T1 driver: Support for periodicals and better timezone detection" + + - title: "Add a remove cover entry to the right click menu of the cover display in the right panel" + tickets: [874689] + + bug fixes: + - title: "Amazon metadata download: Fix for change in Amazon website that broke downloading metadata." 
+ tickets: [878395] + + - title: "MOBI metadata: When reading titles from MOBI files only use the title in the PDB header if there is no long title in the EXTH header" + tickets: [ 875243 ] + + - title: "Fix regression that broke use of complex custom columns in save to disk templates." + tickets: [877366] + + - title: "Fix regression that broke reading metadata from CHM files" + + - title: "Fix a bug that broke conversion of some zipped up HTML files with non ascii filenames on certain windows installs." + tickets: [873288] + + - title: "RTF Input: Fix bug in handling of paragraph separators." + tickets: [863735] + + - title: "Fix a regression that broke downloading certain periodicals for the Kindle." + tickets: [875595] + + - title: "Fix regression that broke updating of covers inside ebook files when saving to disk" + + - title: "Fix regression breaking editing the 'show in tag browser' checkbox in custom column setup editing" + + - title: "Fix typo that broke stopping selected jobs in 0.8.22" + + improved recipes: + - Columbus Dispatch + - Ming Pao + - La Republica + - Korea Times + - USA Today + - CNN + - Liberation + - El Pais + - Helsingin Sanomat + + new recipes: + - title: Kyugyhang, Hankyoreh and Hankyoreh21 + author: Seongkyoun Yoo. + + - title: English Katherimini + author: Thomas Scholl + + - title: Various French news sources + author: Aurelien Chabot. + +- version: 0.8.22 + date: 2011-10-14 + + new features: + - title: "Input plugin for OCR-ed DJVU files (i.e. .djvu files that contain text. Only the text is converted)" + type: major + + - title: "Driver for the SONY PRS T1" + + - title: "Add a 'Back' button to the metadata download dialog while downloading covers, so that you can go back and select a different match if you don't like the covers, instead of having to re-do the entire download." 
+ tickets: [855055] + + - title: "Add an option in Preferences->Saving to disk to not show files in file browser after saving to disk" + + - title: "Get Books: Add the amazon.fr store. Remove leading 'by' from author names. Fix encoding issues with non English titles/names" + + - title: "Driver for Onyx BOOX A61S/X61S" + tickets: [872741] + + - title: "Kobo: Add support for uploading new covers to the device without converting the ePub. You can just resend the book to have the cover updated" + + - title: "Make it a little harder to ignore the fact that there are multiple toolbars when customizing toolbars" + tickets: [864589] + + bug fixes: + - title: "MOBI Input: Remove invalid tags of the form " + tickets: [872883] + + - title: "calibredb add_format does not refresh running calibre instance" + tickets: [872961] + + - title: "Conversion pipeline: Translate to CSS font-family" + tickets: [871388] + + - title: "When sending email add a Date: header so that amavis does not consider the emails to be spam" + + - title: "Fix for the problem where setting the restriction to an empty current search clears the restriction box but does not clear the restriction." + tickets: [871921] + + - title: "Fix generation of column coloring rules for date/time columns" + + - title: "Fix plugboard problem where customizations to formats accepted by a device were ignored." + + - title: "Enable adding of various actions to the toolbar when device is connected (they had been erroneously marked as being non-addable)" + + - title: "Fixable content in library check is not hidden after repair" + tickets: [864096] + + - title: "Catalog generation: Handle a corrupted thumbnail cache." + + - title: "Do not error out when user clicks stop selected job with no job selected." 
+ tickets: [863766] + + improved recipes: + - automatiseringgids + - CNET + - Geek and Poke + - Gosc Niedzielny + - Dilbert + - Economist + - Ming Pao + - Metro UK + - Heise Online + - FAZ.net + - Houston Chronicle + - Slate + - Descopera + + new recipes: + - title: WoW Insider + author: Krittika Goyal + + - title: Merco Press and Penguin news + author: Russell Phillips + + - title: Defense News + author: Darko Miletic + + - title: Revista Piaui + author: Eduardo Simoes + + - title: Dark Horizons + author: Jaded + + - title: Various polish news sources + author: fenuks + + - version: 0.8.21 date: 2011-09-30 diff --git a/recipes/20minutes.recipe b/recipes/20minutes.recipe new file mode 100644 index 0000000000..683f89fac9 --- /dev/null +++ b/recipes/20minutes.recipe @@ -0,0 +1,70 @@ +# -*- coding: utf-8 -*- +__license__ = 'GPL v3' +__copyright__ = '2011 Aurélien Chabot ' +''' +20minutes.fr +''' +from calibre.web.feeds.recipes import BasicNewsRecipe + +class Minutes(BasicNewsRecipe): + + title = '20 minutes' + __author__ = 'calibre' + description = 'Actualités' + encoding = 'cp1252' + publisher = '20minutes.fr' + category = 'Actualités, France, Monde' + language = 'fr' + + use_embedded_content = False + timefmt = ' [%d %b %Y]' + max_articles_per_feed = 15 + no_stylesheets = True + remove_empty_feeds = True + filterDuplicates = True + + extra_css = ''' + h1 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;} + .mna-details {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;} + .mna-image {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;} + .mna-body {font-size:medium; font-family:Arial,Helvetica,sans-serif;} + ''' + + remove_tags = [ + dict(name='iframe'), + dict(name='div', attrs={'class':['mn-section-heading']}), + dict(name='a', attrs={'href':['#commentaires']}), + dict(name='div', attrs={'class':['mn-right']}), + dict(name='div', attrs={'class':['mna-box']}), + dict(name='div', 
attrs={'class':['mna-comment-call']}), + dict(name='div', attrs={'class':['mna-tools']}), + dict(name='div', attrs={'class':['mn-trilist']}) + ] + + keep_only_tags = [dict(id='mn-article')] + + remove_tags_after = dict(name='div', attrs={'class':['mna-body','mna-signature']}) + + + feeds = [ + ('France', 'http://www.20minutes.fr/rss/actu-france.xml'), + ('International', 'http://www.20minutes.fr/rss/monde.xml'), + ('Tech/Web', 'http://www.20minutes.fr/rss/hightech.xml'), + ('Sciences', 'http://www.20minutes.fr/rss/sciences.xml'), + ('Economie', 'http://www.20minutes.fr/rss/economie.xml'), + ('Politique', 'http://www.20minutes.fr/rss/politique.xml'), + (u'Médias', 'http://www.20minutes.fr/rss/media.xml'), + ('Cinema', 'http://www.20minutes.fr/rss/cinema.xml'), + ('People', 'http://www.20minutes.fr/rss/people.xml'), + ('Culture', 'http://www.20minutes.fr/rss/culture.xml'), + ('Sport', 'http://www.20minutes.fr/rss/sport.xml'), + ('Paris', 'http://www.20minutes.fr/rss/paris.xml'), + ('Lyon', 'http://www.20minutes.fr/rss/lyon.xml'), + ('Toulouse', 'http://www.20minutes.fr/rss/toulouse.xml') + ] + + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup diff --git a/recipes/abc_au.recipe b/recipes/abc_au.recipe index 1330f8e4b5..f08beb4dae 100644 --- a/recipes/abc_au.recipe +++ b/recipes/abc_au.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2010, Dean Cording' +__copyright__ = '2011, Pat Stapleton ' ''' abc.net.au/news ''' @@ -8,7 +8,7 @@ from calibre.web.feeds.recipes import BasicNewsRecipe class ABCNews(BasicNewsRecipe): title = 'ABC News' - __author__ = 'Dean Cording' + __author__ = 'Pat Stapleton, Dean Cording' description = 'News from Australia' masthead_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png' cover_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png' @@ -23,7 +23,9 @@ class ABCNews(BasicNewsRecipe): category = 'News, Australia, World' language 
= 'en_AU' publication_type = 'newsportal' - preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: '')] +# preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: '')] +#Remove annoying map links (inline-caption class is also used for some image captions! hence regex to match maps.google) + preprocess_regexps = [(re.compile(r'|]*>|)', re.DOTALL|re.IGNORECASE), lambda match: ''), diff --git a/recipes/b365realitatea.recipe b/recipes/b365realitatea.recipe new file mode 100644 index 0000000000..80a1ee225b --- /dev/null +++ b/recipes/b365realitatea.recipe @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL v3' +__copyright__ = u'2011, Silviu Cotoar\u0103' +''' +b365.realitatea.net +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class b365Realitatea(BasicNewsRecipe): + title = u'b365 Realitatea' + __author__ = u'Silviu Cotoar\u0103' + publisher = u'b365 Realitatea' + description = u'b365 Realitatea' + oldest_article = 5 + language = 'ro' + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + category = 'Ziare,Romania,Bucuresti' + encoding = 'utf-8' + cover_url = 'http://b365.realitatea.net/wp-content/themes/b/images/b365-logo.png' + + conversion_options = { + 'comments' : description + ,'tags' : category + ,'language' : language + ,'publisher' : publisher + } + + keep_only_tags = [ + dict(name='div', attrs={'class':'newsArticle'}) + ] + + remove_tags = [ + dict(name='div', attrs={'class':'date'}) + , dict(name='dic', attrs={'class':'addthis_toolbox addthis_default_style'}) + , dict(name='div', attrs={'class':'related_posts'}) + , dict(name='div', attrs={'id':'RelevantiWidget'}) + ] + + remove_tags_after = [ + dict(name='div', attrs={'id':'RelevantiWidget'}) + ] + feeds = [ + (u'\u0218tiri', u'http://b365.realitatea.net/rss-full/') + ] + + def preprocess_html(self, soup): + return self.adeify_images(soup) + diff --git a/recipes/bbc.recipe b/recipes/bbc.recipe index 2bccbaf4ae..9b2d4854bb 100644 --- 
a/recipes/bbc.recipe +++ b/recipes/bbc.recipe @@ -1,61 +1,648 @@ -__license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic ' +## +## Title: BBC News, Sport, and Blog Calibre Recipe +## Contact: mattst - jmstanfield@gmail.com +## +## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html +## Copyright: mattst - jmstanfield@gmail.com +## +## Written: November 2011 +## Last Edited: 2011-11-19 +## + +__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html' +__copyright__ = 'mattst - jmstanfield@gmail.com' + + ''' -news.bbc.co.uk +BBC News, Sport, and Blog Calibre Recipe ''' + +# Import the regular expressions module. import re + +# Import the BasicNewsRecipe class which this class extends. from calibre.web.feeds.recipes import BasicNewsRecipe -class BBC(BasicNewsRecipe): - title = 'BBC News' - __author__ = 'Darko Miletic, Starson17' - description = 'News from UK. ' - oldest_article = 2 - max_articles_per_feed = 100 - no_stylesheets = True - #delay = 1 - use_embedded_content = False - encoding = 'utf8' - publisher = 'BBC' - category = 'news, UK, world' - language = 'en_GB' - publication_type = 'newsportal' - extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' - preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: '')] - conversion_options = { - 'comments' : description - ,'tags' : category - ,'language' : language - ,'publisher' : publisher - ,'linearize_tables': True +class BBCNewsSportBlog(BasicNewsRecipe): + + # + # **** IMPORTANT USERS READ ME **** + # + # First select the feeds you want then scroll down below the feeds list + # and select the values you want for the other user preferences, like + # oldest_article and such like. + # + # + # Select the BBC rss feeds which you want in your ebook. 
+ # Selected feeds have NO '#' at their start, de-selected feeds begin with a '#'. + # + # Eg. ("News Home", "http://feeds.bbci.co.uk/... - include feed. + # Eg. #("News Home", "http://feeds.bbci.co.uk/... - do not include feed. + # + # There are 68 feeds below which constitute the bulk of the available rss + # feeds on the BBC web site. These include 5 blogs by editors and + # correspondents, 16 sports feeds, 15 'sub' regional feeds (Eg. North West + # Wales, Scotland Business), and 7 Welsh language feeds. + # + # Some of the feeds are low volume (Eg. blogs), or very low volume (Eg. Click) + # so if "oldest_article = 1.5" (only articles published in the last 36 hours) + # you may get some 'empty feeds' which will not then be included in the ebook. + # + # The 15 feeds currently selected below are simply my default ones. + # + # Note: With all 68 feeds selected, oldest_article set to 2, + # max_articles_per_feed set to 100, and simultaneous_downloads set to 10, + # the ebook creation took 29 minutes on my speedy 100 mbps net connection, + # fairly high-end desktop PC running Linux (Ubuntu Lucid-Lynx). + # More realistically with 15 feeds selected, oldest_article set to 1.5, + # max_articles_per_feed set to 100, and simultaneous_downloads set to 20, + # it took 6 minutes. If that's too slow increase 'simultaneous_downloads'. + # + # Select / de-select the feeds you want in your ebook. + # + feeds = [ + ("News Home", "http://feeds.bbci.co.uk/news/rss.xml"), + ("UK", "http://feeds.bbci.co.uk/news/uk/rss.xml"), + ("World", "http://feeds.bbci.co.uk/news/world/rss.xml"), + #("England", "http://feeds.bbci.co.uk/news/england/rss.xml"), + #("Scotland", "http://feeds.bbci.co.uk/news/scotland/rss.xml"), + #("Wales", "http://feeds.bbci.co.uk/news/wales/rss.xml"), + #("N. 
Ireland", "http://feeds.bbci.co.uk/news/northern_ireland/rss.xml"), + #("Africa", "http://feeds.bbci.co.uk/news/world/africa/rss.xml"), + #("Asia", "http://feeds.bbci.co.uk/news/world/asia/rss.xml"), + #("Europe", "http://feeds.bbci.co.uk/news/world/europe/rss.xml"), + #("Latin America", "http://feeds.bbci.co.uk/news/world/latin_america/rss.xml"), + #("Middle East", "http://feeds.bbci.co.uk/news/world/middle_east/rss.xml"), + ("US & Canada", "http://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml"), + ("Politics", "http://feeds.bbci.co.uk/news/politics/rss.xml"), + ("Science/Environment", "http://feeds.bbci.co.uk/news/science_and_environment/rss.xml"), + ("Technology", "http://feeds.bbci.co.uk/news/technology/rss.xml"), + ("Magazine", "http://feeds.bbci.co.uk/news/magazine/rss.xml"), + ("Entertainment/Arts", "http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml"), + #("Health", "http://feeds.bbci.co.uk/news/health/rss.xml"), + #("Education/Family", "http://feeds.bbci.co.uk/news/education/rss.xml"), + ("Business", "http://feeds.bbci.co.uk/news/business/rss.xml"), + ("Special Reports", "http://feeds.bbci.co.uk/news/special_reports/rss.xml"), + ("Also in the News", "http://feeds.bbci.co.uk/news/also_in_the_news/rss.xml"), + #("Newsbeat", "http://www.bbc.co.uk/newsbeat/rss.xml"), + #("Click", "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/programmes/click_online/rss.xml"), + ("Blog: Nick Robinson (Political Editor)", "http://feeds.bbci.co.uk/news/correspondents/nickrobinson/rss.sxml"), + #("Blog: Mark D'Arcy (Parliamentary Correspondent)", "http://feeds.bbci.co.uk/news/correspondents/markdarcy/rss.sxml"), + #("Blog: Robert Peston (Business Editor)", "http://feeds.bbci.co.uk/news/correspondents/robertpeston/rss.sxml"), + #("Blog: Stephanie Flanders (Economics Editor)", "http://feeds.bbci.co.uk/news/correspondents/stephanieflanders/rss.sxml"), + ("Blog: Rory Cellan-Jones (Technology correspondent)", 
"http://feeds.bbci.co.uk/news/correspondents/rorycellanjones/rss.sxml"), + ("Sport Front Page", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml"), + #("Football", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml"), + #("Cricket", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/cricket/rss.xml"), + #("Rugby Union", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_union/rss.xml"), + #("Rugby League", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_league/rss.xml"), + #("Tennis", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/tennis/rss.xml"), + #("Golf", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/golf/rss.xml"), + #("Motorsport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/motorsport/rss.xml"), + #("Boxing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/boxing/rss.xml"), + #("Athletics", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/athletics/rss.xml"), + #("Snooker", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/snooker/rss.xml"), + #("Horse Racing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/horse_racing/rss.xml"), + #("Cycling", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/cycling/rss.xml"), + #("Disability Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/disability_sport/rss.xml"), + #("Other Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/rss.xml"), + #("Olympics 2012", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/olympics_2012/rss.xml"), + #("N. Ireland Politics", "http://feeds.bbci.co.uk/news/northern_ireland/northern_ireland_politics/rss.xml"), + #("Scotland Politics", "http://feeds.bbci.co.uk/news/scotland/scotland_politics/rss.xml"), + #("Scotland Business", "http://feeds.bbci.co.uk/news/scotland/scotland_business/rss.xml"), + #("E. 
Scotland, Edinburgh & Fife", "http://feeds.bbci.co.uk/news/scotland/edinburgh_east_and_fife/rss.xml"), + #("W. Scotland & Glasgow", "http://feeds.bbci.co.uk/news/scotland/glasgow_and_west/rss.xml"), + #("Highlands & Islands", "http://feeds.bbci.co.uk/news/scotland/highlands_and_islands/rss.xml"), + #("NE. Scotland, Orkney & Shetland", "http://feeds.bbci.co.uk/news/scotland/north_east_orkney_and_shetland/rss.xml"), + #("South Scotland", "http://feeds.bbci.co.uk/news/scotland/south_scotland/rss.xml"), + #("Central Scotland & Tayside", "http://feeds.bbci.co.uk/news/scotland/tayside_and_central/rss.xml"), + #("Wales Politics", "http://feeds.bbci.co.uk/news/wales/wales_politics/rss.xml"), + #("NW. Wales", "http://feeds.bbci.co.uk/news/wales/north_west_wales/rss.xml"), + #("NE. Wales", "http://feeds.bbci.co.uk/news/wales/north_east_wales/rss.xml"), + #("Mid. Wales", "http://feeds.bbci.co.uk/news/wales/mid_wales/rss.xml"), + #("SW. Wales", "http://feeds.bbci.co.uk/news/wales/south_west_wales/rss.xml"), + #("SE. Wales", "http://feeds.bbci.co.uk/news/wales/south_east_wales/rss.xml"), + #("Newyddion - News in Welsh", "http://feeds.bbci.co.uk/newyddion/rss.xml"), + #("Gwleidyddiaeth", "http://feeds.bbci.co.uk/newyddion/gwleidyddiaeth/rss.xml"), + #("Gogledd-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/gogledd-ddwyrain/rss.xml"), + #("Gogledd-Orllewin", "http://feeds.bbci.co.uk/newyddion/gogledd-orllewin/rss.xml"), + #("Canolbarth", "http://feeds.bbci.co.uk/newyddion/canolbarth/rss.xml"), + #("De-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/de-ddwyrain/rss.xml"), + #("De-Orllewin", "http://feeds.bbci.co.uk/newyddion/de-orllewin/rss.xml"), + ] + + + # **** SELECT YOUR USER PREFERENCES **** + + # Title to use for the ebook. + # + title = 'BBC News' + + # A brief description for the ebook. + # + description = u'BBC web site ebook created using rss feeds.' + + # The max number of articles which may be downloaded from each feed. 
+ # I've never seen more than about 70 articles in a single feed in the + # BBC feeds. + # + max_articles_per_feed = 100 + + # The max age of articles which may be downloaded from each feed. This is + # specified in days - note fractions of days are allowed, Eg. 2.5 (2 and a + # half days). My default of 1.5 days is the last 36 hours, the point at + # which I've decided 'news' becomes 'old news', but be warned this is not + # so good for the blogs, technology, magazine, etc., and sports feeds. + # You may wish to extend this to 2-5 but watch out ebook creation time will + # increase as well. Setting this to 30 will get everything (AFAICT) as long + # as max_articles_per_feed remains set high (except for 'Click' which is + # v. low volume and its currently oldest article is 4th Feb 2011). + # + oldest_article = 1.5 + + # Number of simultaneous downloads. 20 is consistently working fine on the + # BBC News feeds with no problems. Speeds things up from the default of 5. + # If you have a lot of feeds and/or have increased oldest_article above 2 + # then you may wish to try increasing simultaneous_downloads to 25-30, + # Or, of course, if you are in a hurry. [I've not tried beyond 20.] + # + simultaneous_downloads = 20 + + # Timeout for fetching files from the server in seconds. The default of + # 120 seconds, seems somewhat excessive. + # + timeout = 30 + + # The format string for the date shown on the ebook's first page. + # List of all values: http://docs.python.org/library/time.html + # Default in news.py has a leading space so that's mirrored here. + # As with 'feeds' select/de-select by adding/removing the initial '#', + # only one timefmt should be selected, here's a few to choose from. 
+ # + timefmt = ' [%a, %d %b %Y]' # [Fri, 14 Nov 2011] (Calibre default) + #timefmt = ' [%a, %d %b %Y %H:%M]' # [Fri, 14 Nov 2011 18:30] + #timefmt = ' [%a, %d %b %Y %I:%M %p]' # [Fri, 14 Nov 2011 06:30 PM] + #timefmt = ' [%d %b %Y]' # [14 Nov 2011] + #timefmt = ' [%d %b %Y %H:%M]' # [14 Nov 2011 18.30] + #timefmt = ' [%Y-%m-%d]' # [2011-11-14] + #timefmt = ' [%Y-%m-%d-%H-%M]' # [2011-11-14-18-30] + + + + # + # **** IMPORTANT **** + # + # DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING. + # + # DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING. + # + # I MEAN IT, YES I DO, ABSOLUTELY, AT YOU OWN RISK. :) + # + # **** IMPORTANT **** + # + + + + # Author of this recipe. + __author__ = 'mattst' + + # Specify English as the language of the RSS feeds (ISO-639 code). + language = 'en_GB' + + # Set tags. + tags = 'news, sport, blog' + + # Set publisher and publication type. + publisher = 'BBC' + publication_type = 'newspaper' + + # Disable stylesheets from site. + no_stylesheets = True + + # Specifies an override encoding for sites that have an incorrect charset + # specified. Default of 'None' says to auto-detect. Some other BBC recipes + # use 'utf8', which works fine (so use that if necessary) but auto-detecting + # with None is working fine, so stick with that for robustness. + encoding = None + + # Sets whether a feed has full articles embedded in it. The BBC feeds do not. + use_embedded_content = False + + # Removes empty feeds - why keep them!? + remove_empty_feeds = True + + # Create a custom title which fits nicely in the Kindle title list. + # Requires "import time" above class declaration, and replacing + # title with custom_title in conversion_options (right column only). + # Example of string below: "BBC News - 14 Nov 2011" + # + # custom_title = "BBC News - " + time.strftime('%d %b %Y') + + ''' + # Conversion options for advanced users, but don't forget to comment out the + # current conversion_options below. 
Avoid setting 'linearize_tables' as that + # plays havoc with the 'old style' table based pages. + # + conversion_options = { 'title' : title, + 'comments' : description, + 'tags' : tags, + 'language' : language, + 'publisher' : publisher, + 'authors' : publisher, + 'smarten_punctuation' : True } + ''' - keep_only_tags = [ - dict(name='div', attrs={'class':['layout-block-a layout-block']}) - ,dict(attrs={'class':['story-body','storybody']}) - ] + conversion_options = { 'smarten_punctuation' : True } - remove_tags = [ - dict(name='div', attrs={'class':['story-feature related narrow', 'share-help', 'embedded-hyper', - 'story-feature wide ', 'story-feature narrow']}), - dict(id=['hypertab', 'comment-form']), - ] + # Specify extra CSS - overrides ALL other CSS (IE. Added last). + extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \ + .introduction, .first { font-weight: bold; } \ + .cross-head { font-weight: bold; font-size: 125%; } \ + .cap, .caption { display: block; font-size: 80%; font-style: italic; } \ + .cap, .caption, .caption img, .caption span { display: block; text-align: center; margin: 5px auto; } \ + .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \ + .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \ + text-align: center; font-size: 80%; font-style: italic; margin: 1px auto; } \ + .story-date, .published { font-size: 80%; } \ + table { width: 100%; } \ + td img { display: block; margin: 5px auto; } \ + ul { padding-top: 10px; } \ + ol { padding-top: 10px; } \ + li { padding-top: 5px; padding-bottom: 5px; } \ + h1 { text-align: center; font-size: 175%; font-weight: bold; } \ + h2 { text-align: center; font-size: 150%; font-weight: bold; } \ + h3 { text-align: center; font-size: 125%; font-weight: bold; } \ + h4, h5, h6 { text-align: center; font-size: 100%; font-weight: bold; }' - remove_attributes = ['width','height'] + # Remove various tag attributes to improve the 
look of the ebook pages. + remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan', + 'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ] - feeds = [ - ('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'), - ('Science/Nature', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/science/nature/rss.xml'), - ('Technology', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/technology/rss.xml'), - ('Entertainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'), - ('Magazine', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/magazine/rss.xml'), - ('Business', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml'), - ('Health', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/health/rss.xml'), - ('Americas', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml'), - ('Europe', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml'), - ('South Asia', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml'), - ('UK', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/rss.xml'), - ('Asia-Pacific', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/asia-pacific/rss.xml'), - ('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'), - ] + # Remove the (admittedly rarely used) line breaks, "
<br />", which sometimes + # cause a section of the ebook to start in an unsightly fashion or, more + # frequently, a "<br />
" will muck up the formatting of a correspondent's byline. + # "<br />
" and "<br clear/>
" are far more frequently used on the table formatted + # style of pages, and really spoil the look of the ebook pages. + preprocess_regexps = [(re.compile(r'', re.IGNORECASE), lambda m: ''), + (re.compile(r'', re.IGNORECASE), lambda m: '')] + + # Create regular expressions for tag keeping and removal to make the matches more + # robust against minor changes and errors in the HTML, Eg. double spaces, leading + # and trailing spaces, missing hyphens, and such like. + # Python regular expression ('re' module) page: http://docs.python.org/library/re.html + + # *************************************** + # Regular expressions for keep_only_tags: + # *************************************** + + # The BBC News HTML pages use variants of 'storybody' to denote the section of an HTML + # page which contains the main text of the article. Match storybody variants: 'storybody', + # 'story-body', 'story body','storybody ', etc. + storybody_reg_exp = '^.*story[_ -]*body.*$' + + # The BBC sport and 'newsbeat' (features) HTML pages use 'blq_content' to hold the title + # and published date. This is one level above the usual news pages which have the title + # and date within 'story-body'. This is annoying since 'blq_content' must also be kept, + # resulting in a lot of extra things to be removed by remove_tags. + blq_content_reg_exp = '^.*blq[_ -]*content.*$' + + # The BBC has an alternative page design structure, which I suspect is an out-of-date + # design but which is still used in some articles, Eg. 'Click' (technology), 'FastTrack' + # (travel), and in some sport pages. These alternative pages are table based (which is + # why I think they are an out-of-date design) and account for -I'm guesstimating- less + # than 1% of all articles. They use a table class 'storycontent' to hold the article + # and like blq_content (above) have required lots of extra removal by remove_tags. 
+ story_content_reg_exp = '^.*story[_ -]*content.*$' + + # Keep the sections of the HTML which match the list below. The HTML page created by + # Calibre will fill with those sections which are matched. Note that the + # blq_content_reg_exp must be listed before storybody_reg_exp in keep_only_tags due to + # it being the parent of storybody_reg_exp, that is to say the div class/id 'story-body' + # will be inside div class/id 'blq_content' in the HTML (if 'blq_content' is there at + # all). If they are the other way around in keep_only_tags then blq_content_reg_exp + # will end up being discarded. + keep_only_tags = [ dict(name='table', attrs={'class':re.compile(story_content_reg_exp, re.IGNORECASE)}), + dict(name='div', attrs={'class':re.compile(blq_content_reg_exp, re.IGNORECASE)}), + dict(name='div', attrs={'id':re.compile(blq_content_reg_exp, re.IGNORECASE)}), + dict(name='div', attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)}), + dict(name='div', attrs={'id':re.compile(storybody_reg_exp, re.IGNORECASE)}) ] + + # ************************************ + # Regular expressions for remove_tags: + # ************************************ + + # Regular expression to remove share-help and variant tags. The share-help class + # is used by the site for a variety of 'sharing' type links, Eg. Facebook, delicious, + # twitter, email. Removed to avoid page clutter. + share_help_reg_exp = '^.*share[_ -]*help.*$' + + # Regular expression to remove embedded-hyper and variant tags. This class is used to + # display links to other BBC News articles on the same/similar subject. + embedded_hyper_reg_exp = '^.*embed*ed[_ -]*hyper.*$' + + # Regular expression to remove hypertabs and variant tags. This class is used to + # display a tab bar at the top of an article which allows the user to switch to + # an article (viewed on the same page) providing further info., 'in depth' analysis, + # an editorial, a correspondent's blog entry, and such like. 
The ability to handle + # a tab bar of this nature is currently beyond the scope of this recipe and + # possibly of Calibre itself (not sure about that - TO DO - check!). + hypertabs_reg_exp = '^.*hyper[_ -]*tabs.*$' + + # Regular expression to remove story-feature and variant tags. Eg. 'story-feature', + # 'story-feature related narrow', 'story-feature wide', 'story-feature narrow'. + # This class is used to add additional info. boxes, or small lists, outside of + # the main story. TO DO: Work out a way to incorporate these neatly. + story_feature_reg_exp = '^.*story[_ -]*feature.*$' + + # Regular expression to remove video and variant tags, Eg. 'videoInStoryB', + # 'videoInStoryC'. This class is used to embed video. + video_reg_exp = '^.*video.*$' + + # Regular expression to remove audio and variant tags, Eg. 'audioInStoryD'. + # This class is used to embed audio. + audio_reg_exp = '^.*audio.*$' + + # Regular expression to remove pictureGallery and variant tags, Eg. 'pictureGallery'. + # This class is used to embed a photo slideshow. See also 'slideshow' below. + picture_gallery_reg_exp = '^.*picture.*$' + + # Regular expression to remove slideshow and variant tags, Eg. 'dslideshow-enclosure'. + # This class is used to embed a slideshow (not necessarily photo) but both + # 'slideshow' and 'pictureGallery' are used for slideshows. + slideshow_reg_exp = '^.*slide[_ -]*show.*$' + + # Regular expression to remove social-links and variant tags. This class is used to + # display links to a BBC blogger's main page, used in various columnists' blogs + # (Eg. Nick Robinson, Robert Peston). + social_links_reg_exp = '^.*social[_ -]*links.*$' + + # Regular expression to remove quote and (multi) variant tags, Eg. 'quote', + # 'endquote', 'quote-credit', 'quote-credit-title', etc. These are usually + # removed by 'story-feature' removal (as they are usually within them), but + # not always. 
The quotation removed is always (AFAICT) in the article text + # as well but a 2nd copy is placed in a quote tag to draw attention to it. + # The quote class tags may or may not appear in div's. + quote_reg_exp = '^.*quote.*$' + + # Regular expression to remove hidden and variant tags, Eg. 'hidden'. + # The purpose of these is unclear, they seem to be an internal link to a + # section within the article, but the text of the link (Eg. 'Continue reading + # the main story') never seems to be displayed anyway. Removed to avoid clutter. + # The hidden class tags may or may not appear in div's. + hidden_reg_exp = '^.*hidden.*$' + + # Regular expression to remove comment and variant tags, Eg. 'comment-introduction'. + # Used on the site to display text about registered users entering comments. + comment_reg_exp = '^.*comment.*$' + + # Regular expression to remove form and variant tags, Eg. 'comment-form'. + # Used on the site to allow registered BBC users to fill in forms, typically + # for entering comments about an article. + form_reg_exp = '^.*form.*$' + + # Extra things to remove due to the addition of 'blq_content' in keep_only_tags. + + #
Used on sports pages for 'email' and 'print'. + story_actions_reg_exp = '^.*story[_ -]*actions.*$' + + #
Used on sports pages instead of 'share-help' (for + # social networking links). + bookmark_list_reg_exp = '^.*bookmark[_ -]*list.*$' + + #
+ # NOTE: Don't remove class="content-group" that is needed. + # Used on sports pages to link to 'similar stories'. + secondary_content_reg_exp = '^.*secondary[_ -]*content.*$' + + #