diff --git a/Changelog.yaml b/Changelog.yaml index d3032623a0..39bfd0ef10 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -19,6 +19,181 @@ # new recipes: # - title: +- version: 0.8.58 + date: 2012-06-29 + + new features: + - title: "Add some texture to calibre generated covers" + + - title: "Drivers for Sogo SS-4370, HTC G2 and Lenovo ThinkPad Tablet" + tickets: [1019050, 1017010] + + - title: "Add search to the Manage tags/series/etc. dialogs" + + - title: "News download: Add support for images embedded in the HTML" + + - title: "calibre -s now waits for calibre to shutdown" + + bug fixes: + - title: "Workaround for iTunes breaking scripting with version 10.6.3 on OS X." + tickets: [1012243] + + - title: "EPUB Input: When there are multiple elements of the same type in the OPF guide, use the first rather than the last element." + + - title: "Windows: Disable the new UI style if the color depth of the desktop is less than 32 bits per pixel" + + - title: "ISBNDB metadata plugin: Return results even though they have no comments" + + - title: "More robust handling of EINTR during IPC" + + - title: "Metadata download: Support for amazon's new results page markup" + + - title: "EPUB Output: Fix a bug that could cause corrupted output when doing an EPUB/OEB to EPUB conversion if the input EPUB had multiple files with the same name" + + - title: "KF8 Output: Fix a couple of bugs that could lead to generation of invalid KF8 files." + tickets: [1016672] + + improved recipes: + - ABC Digital + - O Globo + + new recipes: + - title: Sign of the Times and New Statesman + author: TerminalVeracity + + - title: CT24 + author: zoidozoido + + - title: SmileZilla + author: Will + + - title: Marketing Sensoriale + author: NotTaken + +- version: 0.8.57 + date: 2012-06-22 + + new features: + - title: "PDF Output: Full pagination support. No more cutoff bottom line." 
+ type: major + description: "Fixes a long standing bug in calibre's PDF Output that caused the bottom line of some pages to be partially cut off and prevented top and bottom margins from working." + + - title: "calibredb add now prints out the ids of added books" + tickets: [1014303] + + - title: "Kobo Vox driver: Add support for new Google Play firmware" + tickets: [1014129] + + - title: "Driver for Prestigio PMP5097PRO" + tickets: [1013864] + + - title: "Add option to disable tooltips in the book list under Preferences->Look & Feel" + + - title: "When customizing builtin recipes download the latest version of the recipe to customize instead of using the possibly out of date bundled version" + + bug fixes: + - title: "PDF Output: Use the cover from the input document when no cover is specified during a conversion" + + - title: "E-book Viewer: Printing now has proper pagination with top and bottom margins no lines partially cut-off at the bottom and full style retention" + + - title: "KF8 Input: Handle files with incorrectly encoded guide type entries." + tickets: [1015020] + + - title: "E-book viewer: Disable hyphenation on windows xp as Qt WebKit barfs on soft hyphens on windows XP" + + - title: "Handle OS X systems with invalid palette colors." + tickets: [1014900] + + - title: "Tag Browser: Fix regression that broke partitioning of hierarchical categories." 
+ tickets: [1014065] + + - title: "LRF Output: Handle negative page margins" + tickets: [1014103] + + - title: "Template language: Fix arithmetic functions to tolerate the value 'None' as returned by raw_field()" + + - title: "Fix custom title sort set in the edit metadata dialog getting reset by the conversion dialog" + + improved recipes: + - The Economist + - Akter + - 24 Sata sr + - Novi List + - Metro Montreal + - Mode Durable + - CanardPC + - The Economic Collapse + - Our Daily Bread + + new recipes: + - title: Akter Daily + author: Darko Miletic + + - title: BBC Brasil + author: Claviola + + - title: Homopedia.pl + author: rainbowwarrior + + - title: National Geographic Magazine + author: Terminal Veracity + + - title: Something Awful + author: atordo + + - title: Huffington Post UK + author: Krittika Goyal + +- version: 0.8.56 + date: 2012-06-15 + + new features: + - title: "Make the new calibre style default on Windows and OS X." + type: major + description: "This change gives a more 'modern' feel to the calibre user interface with focus highlighting, gradients, rounded corners, etc. In case you prefer the old look, you can restore it under Preferences->Look & Feel->User interface style" + + - title: "Get Books: Add the new SONY Reader store" + + - title: "Read metadata from .docx (Microsoft Word) files" + + - title: "Allow customizing the behavior of the searching for similar books by right clicking the book. You can now tell calibre to search different columns than the traditional author/series/publisher/tags/etc. 
in Preferences->Searching" + + - title: "Add option to restore alternating row colors to the Tag Browser under Preferences->Look & Feel->Tag Browser" + + - title: "Update to Qt 4.8.2 on windows compiled with link time code generation for a small performance boost" + + bug fixes: + - title: "Get Books: Update plugins to handle website changes at ebooks.com, project gutenberg, and virtualo" + + - title: "AZW3 Output: Fix TOC at start option not working" + + - title: "AZW3 Output: Close self closing script/style/title/head tags explicitly as they cause problems in webkit based renderers like the Kindle Fire and calibre's viewers." + + - title: "Fix the current_library_name() template function not updating after a library switch" + + - title: "AZW3 Output: Handle the case of a link pointing to the last line of text in the document." + tickets: [1011330] + + - title: "Fix regression in 0.8.55 that broke highlighting of items matching a search in the Tag Browser" + tickets: [1011030] + + - title: "News download: Handle query only relative URLs" + + improved recipes: + - Christian Science Monitor + - Neue Zurcher Zeitung + - Birmingham Post + - Metro UK + - New Musical Express + - The Independent + - The Daily Mirror + - Vreme + - Smithsonian Magazine + + new recipes: + - title: NZZ Webpaper + author: Bernd Leinfelder + - version: 0.8.55 date: 2012-06-08 diff --git a/manual/creating_plugins.rst b/manual/creating_plugins.rst index d38abfd341..c3f1202365 100644 --- a/manual/creating_plugins.rst +++ b/manual/creating_plugins.rst @@ -172,7 +172,7 @@ You can see the ``prefs`` object being used in main.py: :pyobject: DemoDialog.config -The different types of plugins +The plugin API -------------------------------- As you may have noticed above, a plugin in |app| is a class. There are different classes for the different types of plugins in |app|. @@ -195,7 +195,7 @@ It can get tiresome to keep re-adding a plugin to calibre to test small changes. 
Once you've located the zip file of your plugin you can then directly update it with your changes instead of re-adding it each time. To do so from the command line, in the directory that contains your plugin source code, use:: - calibre -s; sleep 4s; zip -R /path/to/plugin/zip/file.zip *; calibre + calibre -s; zip -r /path/to/plugin/zip/file.zip *; calibre This will shutdown a running calibre. Wait for the shutdown to complete, then update your plugin files and relaunch calibre. It relies on the freely available zip command line tool. diff --git a/recipes/24sata_rs.recipe b/recipes/24sata_rs.recipe index 0f879036ea..a51323f21f 100644 --- a/recipes/24sata_rs.recipe +++ b/recipes/24sata_rs.recipe @@ -1,6 +1,7 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai __license__ = 'GPL v3' -__copyright__ = '2009-2010, Darko Miletic ' +__copyright__ = '2009-2012, Darko Miletic ' ''' 24sata.rs @@ -21,26 +22,29 @@ class Ser24Sata(BasicNewsRecipe): encoding = 'utf-8' use_embedded_content = False language = 'sr' - publication_type = 'newspaper' - extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' + publication_type = 'newsportal' + extra_css = """ + @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} + body{font-family: serif1, serif} + """ conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - , 'linearize_tables' : True + 'comment' : description + , 'tags' : category + , 'publisher': publisher + , 'language' : language } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - feeds = [(u'Vesti Dana', u'http://www.24sata.rs/rss.php')] - - def preprocess_html(self, soup): - return self.adeify_images(soup) + feeds = [ + (u'Vesti' , u'http://www.24sata.rs/rss/vesti.xml' ), + (u'Sport' , u'http://www.24sata.rs/rss/sport.xml' ), + 
(u'Šou' , u'http://www.24sata.rs/rss/sou.xml' ), + (u'Specijal', u'http://www.24sata.rs/rss/specijal.xml'), + (u'Novi Sad', u'http://www.24sata.rs/rss/ns.xml' ) + ] def print_version(self, url): - article = url.partition('#')[0] - article_id = article.partition('id=')[2] - return 'http://www.24sata.rs/_print.php?id=' + article_id - + dpart, spart, apart = url.rpartition('/') + return dpart + '/print/' + apart diff --git a/recipes/abc_py.recipe b/recipes/abc_py.recipe index 297129d269..41005c6844 100644 --- a/recipes/abc_py.recipe +++ b/recipes/abc_py.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic ' +__copyright__ = '2010-2012, Darko Miletic ' ''' abc.com.py ''' @@ -7,7 +7,7 @@ abc.com.py from calibre.web.feeds.news import BasicNewsRecipe class ABC_py(BasicNewsRecipe): - title = 'ABC digital' + title = 'ABC Color' __author__ = 'Darko Miletic' description = 'Noticias de Paraguay y el resto del mundo' publisher = 'ABC' @@ -15,12 +15,16 @@ class ABC_py(BasicNewsRecipe): oldest_article = 2 max_articles_per_feed = 200 no_stylesheets = True - encoding = 'cp1252' + encoding = 'utf8' use_embedded_content = False language = 'es_PY' remove_empty_feeds = True + masthead_url = 'http://www.abc.com.py/plantillas/img/abc-logo.png' publication_type = 'newspaper' - extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} ' + extra_css = """ + body{font-family: UnitSlabProMedium,"Times New Roman",serif } + img{margin-bottom: 0.4em; display: block;} + """ conversion_options = { 'comment' : description @@ -29,21 +33,19 @@ class ABC_py(BasicNewsRecipe): , 'language' : language } - remove_tags = [dict(name=['form','iframe','embed','object','link','base','table']),dict(attrs={'class':'toolbox'})] - remove_tags_after = dict(attrs={'class':'date'}) - keep_only_tags = [dict(attrs={'class':'zcontent'})] + remove_tags = [ + dict(name=['form','iframe','embed','object','link','base','table']), + 
dict(attrs={'class':['es-carousel-wrapper']}), + dict(attrs={'id':['tools','article-banner-1']}) + ] + keep_only_tags = [dict(attrs={'id':'article'})] feeds = [ - (u'Ultimo momento' , u'http://www.abc.com.py/ultimo-momento.xml' ) - ,(u'Nacionales' , u'http://www.abc.com.py/nacionales.xml' ) - ,(u'Internacionales' , u'http://www.abc.com.py/internacionales.xml' ) - ,(u'Deportes' , u'http://www.abc.com.py/deportes.xml' ) - ,(u'Espectaculos' , u'http://www.abc.com.py/espectaculos.xml' ) - ,(u'Ciencia y Tecnologia', u'http://www.abc.com.py/ciencia-y-tecnologia.xml') + (u'Ultimo momento', u'http://www.abc.com.py/rss.xml' ) + ,(u'Nacionales' , u'http://www.abc.com.py/nacionales/rss.xml' ) + ,(u'Mundo' , u'http://www.abc.com.py/internacionales/rss.xml') + ,(u'Deportes' , u'http://www.abc.com.py/deportes/rss.xml' ) + ,(u'Espectaculos' , u'http://www.abc.com.py/espectaculos/rss.xml' ) + ,(u'TecnoCiencia' , u'http://www.abc.com.py/ciencia/rss.xml' ) ] - - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - return soup diff --git a/recipes/akter.recipe b/recipes/akter.recipe index 0f2fb05640..83625c240b 100644 --- a/recipes/akter.recipe +++ b/recipes/akter.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic ' +__copyright__ = '2010-2012, Darko Miletic ' ''' akter.co.rs ''' @@ -8,7 +8,7 @@ import re from calibre.web.feeds.news import BasicNewsRecipe class Akter(BasicNewsRecipe): - title = 'AKTER' + title = 'AKTER - Nedeljnik' __author__ = 'Darko Miletic' description = 'AKTER - nedeljni politicki magazin savremene Srbije' publisher = 'Akter Media Group d.o.o.' 
@@ -18,61 +18,37 @@ class Akter(BasicNewsRecipe): no_stylesheets = True use_embedded_content = False encoding = 'utf-8' - masthead_url = 'http://www.akter.co.rs/templates/gk_thenews2/images/style2/logo.png' + masthead_url = 'http://www.akter.co.rs/gfx/logoneover.png' language = 'sr' publication_type = 'magazine' remove_empty_feeds = True - PREFIX = 'http://www.akter.co.rs' extra_css = """ @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} - .article_description,body{font-family: Arial,Helvetica,sans1,sans-serif} - .color-2{display:block; margin-bottom: 10px; padding: 5px, 10px; - border-left: 1px solid #D00000; color: #D00000} - img{margin-bottom: 0.8em} """ + body{font-family: Tahoma,Geneva,sans1,sans-serif} + img{margin-bottom: 0.8em; display: block;} + """ conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - , 'linearize_tables' : True + 'comment' : description + , 'tags' : category + , 'publisher': publisher + , 'language' : language } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - - feeds = [ - (u'Politika' , u'http://www.akter.co.rs/index.php/politikaprint.html' ) - ,(u'Ekonomija' , u'http://www.akter.co.rs/index.php/ekonomijaprint.html') - ,(u'Life&Style' , u'http://www.akter.co.rs/index.php/lsprint.html' ) - ,(u'Sport' , u'http://www.akter.co.rs/index.php/sportprint.html' ) - ] - - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - return self.adeify_images(soup) + keep_only_tags = [dict(name='div', attrs={'id':'section_to_print'})] + feeds = [(u'Nedeljnik', u'http://akter.co.rs/rss/nedeljnik')] def print_version(self, url): - return url + '?tmpl=component&print=1&page=' - - def parse_index(self): - totalfeeds = [] - lfeeds = self.get_feeds() - for feedobj in lfeeds: - feedtitle, feedurl = feedobj - self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else 
feedurl)) - articles = [] - soup = self.index_to_soup(feedurl) - for item in soup.findAll(attrs={'class':['sectiontableentry1','sectiontableentry2']}): - link = item.find('a') - url = self.PREFIX + link['href'] - title = self.tag_to_string(link) - articles.append({ - 'title' :title - ,'date' :'' - ,'url' :url - ,'description':'' - }) - totalfeeds.append((feedtitle, articles)) - return totalfeeds + dpart, spart, apart = url.rpartition('/') + return dpart + '/print-' + apart + def get_cover_url(self): + soup = self.index_to_soup('http://www.akter.co.rs/weekly.html') + divt = soup.find('div', attrs={'class':'lastissue'}) + if divt: + imgt = divt.find('img') + if imgt: + return 'http://www.akter.co.rs' + imgt['src'] + return None + diff --git a/recipes/akter_dnevnik.recipe b/recipes/akter_dnevnik.recipe new file mode 100644 index 0000000000..7322baf4ec --- /dev/null +++ b/recipes/akter_dnevnik.recipe @@ -0,0 +1,44 @@ +__license__ = 'GPL v3' +__copyright__ = '2012, Darko Miletic ' +''' +akter.co.rs +''' + +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class Akter(BasicNewsRecipe): + title = 'AKTER - Dnevnik' + __author__ = 'Darko Miletic' + description = 'AKTER - Najnovije vesti iz Srbije' + publisher = 'Akter Media Group d.o.o.' 
+ category = 'vesti, online vesti, najnovije vesti, politika, sport, ekonomija, biznis, finansije, berza, kultura, zivot, putovanja, auto, automobili, tehnologija, politicki magazin, dogadjaji, desavanja, lifestyle, zdravlje, zdravstvo, vest, novine, nedeljnik, srbija, novi sad, vojvodina, svet, drustvo, zabava, republika srpska, beograd, intervju, komentar, reportaza, arhiva vesti, news, serbia, politics' + oldest_article = 8 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + encoding = 'utf-8' + masthead_url = 'http://www.akter.co.rs/gfx/logodnover.png' + language = 'sr' + publication_type = 'magazine' + remove_empty_feeds = True + extra_css = """ + @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} + body{font-family: Tahoma,Geneva,sans1,sans-serif} + img{margin-bottom: 0.8em; display: block;} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher': publisher + , 'language' : language + } + + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + keep_only_tags = [dict(name='div', attrs={'id':'section_to_print'})] + feeds = [(u'Vesti', u'http://akter.co.rs/rss/dnevni')] + + def print_version(self, url): + dpart, spart, apart = url.rpartition('/') + return dpart + '/print-' + apart diff --git a/recipes/bbc_brasil.recipe b/recipes/bbc_brasil.recipe new file mode 100644 index 0000000000..a2d83944d1 --- /dev/null +++ b/recipes/bbc_brasil.recipe @@ -0,0 +1,594 @@ +## +## Title: BBC News, Sport, and Blog Calibre Recipe +## Contact: mattst - jmstanfield@gmail.com +## +## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html +## Copyright: mattst - jmstanfield@gmail.com +## +## Written: November 2011 +## Last Edited: 2011-11-19 +## + +__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html' +__copyright__ = 'mattst - jmstanfield@gmail.com' + + +''' +BBC News, Sport, and Blog Calibre Recipe 
+''' + +# Import the regular expressions module. +import re + +# Import the BasicNewsRecipe class which this class extends. +from calibre.web.feeds.recipes import BasicNewsRecipe + +class BBCBrasilRecipe(BasicNewsRecipe): + + # + # **** IMPORTANT USERS READ ME **** + # + # First select the feeds you want then scroll down below the feeds list + # and select the values you want for the other user preferences, like + # oldest_article and such like. + # + # + # Select the BBC rss feeds which you want in your ebook. + # Selected feeds have NO '#' at their start, de-selected feeds begin with a '#'. + # + # Eg. ("News Home", "http://feeds.bbci.co.uk/... - include feed. + # Eg. #("News Home", "http://feeds.bbci.co.uk/... - do not include feed. + # + # There are 68 feeds below which constitute the bulk of the available rss + # feeds on the BBC web site. These include 5 blogs by editors and + # correspondents, 16 sports feeds, 15 'sub' regional feeds (Eg. North West + # Wales, Scotland Business), and 7 Welsh language feeds. + # + # Some of the feeds are low volume (Eg. blogs), or very low volume (Eg. Click) + # so if "oldest_article = 1.5" (only articles published in the last 36 hours) + # you may get some 'empty feeds' which will not then be included in the ebook. + # + # The 15 feeds currently selected below are simply my default ones. + # + # Note: With all 68 feeds selected, oldest_article set to 2, + # max_articles_per_feed set to 100, and simultaneous_downloads set to 10, + # the ebook creation took 29 minutes on my speedy 100 mbps net connection, + # fairly high-end desktop PC running Linux (Ubuntu Lucid-Lynx). + # More realistically with 15 feeds selected, oldest_article set to 1.5, + # max_articles_per_feed set to 100, and simultaneous_downloads set to 20, + # it took 6 minutes. If that's too slow increase 'simultaneous_downloads'. + # + # Select / de-select the feeds you want in your ebook. 
+ # + feeds = [ + (u'Primeira P\xe1gina', u'http://www.bbc.co.uk/portuguese/index.xml'), + (u'\xdaltimas Not\xedcias', u'http://www.bbc.co.uk/portuguese/ultimas_noticias/index.xml'), + (u'Internacional', u'http://www.bbc.co.uk/portuguese/topicos/internacional/index.xml'), + (u'Brasil', u'http://www.bbc.co.uk/portuguese/topicos/brasil/index.xml'), + (u'Am\xe9rica Latina', u'http://www.bbc.co.uk/portuguese/topicos/america_latina/index.xml'), + (u'Economia', u'http://www.bbc.co.uk/portuguese/topicos/economia/index.xml'), + (u'Sa\xfade', u'http://www.bbc.co.uk/portuguese/topicos/saude/index.xml'), + (u'Ci\xeancia e Tecnologia', u'http://www.bbc.co.uk/portuguese/topicos/ciencia_e_tecnologia/index.xml'), + (u'Cultura', u'http://www.bbc.co.uk/portuguese/topicos/cultura/index.xml'), + (u'V\xeddeos e Fotos', u'http://www.bbc.co.uk/portuguese/videos_e_fotos/index.xml'), + (u'Especiais', u'http://www.bbc.co.uk/portuguese/especiais/index.xml') + ] + + + # **** SELECT YOUR USER PREFERENCES **** + + # Title to use for the ebook. + # + title = 'BBC Brasil' + + # A brief description for the ebook. + # + description = u'Not\xedcias do Brasil e do mundo pela British Broadcasting Corporation' + + # The max number of articles which may be downloaded from each feed. + # I've never seen more than about 70 articles in a single feed in the + # BBC feeds. + # + max_articles_per_feed = 100 + + # The max age of articles which may be downloaded from each feed. This is + # specified in days - note fractions of days are allowed, Eg. 2.5 (2 and a + # half days). My default of 1.5 days is the last 36 hours, the point at + # which I've decided 'news' becomes 'old news', but be warned this is not + # so good for the blogs, technology, magazine, etc., and sports feeds. + # You may wish to extend this to 2-5 but watch out ebook creation time will + # increase as well. 
Setting this to 30 will get everything (AFAICT) as long + # as max_articles_per_feed remains set high (except for 'Click' which is + # v. low volume and its currently oldest article is 4th Feb 2011). + # + oldest_article = 1.5 + + # Number of simultaneous downloads. 20 is consistently working fine on the + # BBC News feeds with no problems. Speeds things up from the default of 5. + # If you have a lot of feeds and/or have increased oldest_article above 2 + # then you may wish to try increasing simultaneous_downloads to 25-30, + # Or, of course, if you are in a hurry. [I've not tried beyond 20.] + # + simultaneous_downloads = 20 + + # Timeout for fetching files from the server in seconds. The default of + # 120 seconds, seems somewhat excessive. + # + timeout = 30 + + # The format string for the date shown on the ebook's first page. + # List of all values: http://docs.python.org/library/time.html + # Default in news.py has a leading space so that's mirrored here. + # As with 'feeds' select/de-select by adding/removing the initial '#', + # only one timefmt should be selected, here's a few to choose from. + # + timefmt = ' [%a, %d %b %Y]' # [Fri, 14 Nov 2011] (Calibre default) + #timefmt = ' [%a, %d %b %Y %H:%M]' # [Fri, 14 Nov 2011 18:30] + #timefmt = ' [%a, %d %b %Y %I:%M %p]' # [Fri, 14 Nov 2011 06:30 PM] + #timefmt = ' [%d %b %Y]' # [14 Nov 2011] + #timefmt = ' [%d %b %Y %H:%M]' # [14 Nov 2011 18:30] + #timefmt = ' [%Y-%m-%d]' # [2011-11-14] + #timefmt = ' [%Y-%m-%d-%H-%M]' # [2011-11-14-18-30] + + + + # + # **** IMPORTANT **** + # + # DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING. + # + # DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING. + # + # I MEAN IT, YES I DO, ABSOLUTELY, AT YOUR OWN RISK. :) + # + # **** IMPORTANT **** + # + + + + # Author of this recipe. + __author__ = 'Carlos Laviola' + + language = 'pt_BR' + + # Set tags. + tags = 'news, sport, blog' + + # Set publisher and publication type. 
+ publisher = 'BBC' + publication_type = 'newspaper' + + # Disable stylesheets from site. + no_stylesheets = True + + # Specifies an override encoding for sites that have an incorrect charset + # specified. Default of 'None' says to auto-detect. Some other BBC recipes + # use 'utf8', which works fine (so use that if necessary) but auto-detecting + # with None is working fine, so stick with that for robustness. + encoding = None + + # Sets whether a feed has full articles embedded in it. The BBC feeds do not. + use_embedded_content = False + + # Removes empty feeds - why keep them!? + remove_empty_feeds = True + + # Create a custom title which fits nicely in the Kindle title list. + # Requires "import time" above class declaration, and replacing + # title with custom_title in conversion_options (right column only). + # Example of string below: "BBC News - 14 Nov 2011" + # + # custom_title = "BBC News - " + time.strftime('%d %b %Y') + + ''' + # Conversion options for advanced users, but don't forget to comment out the + # current conversion_options below. Avoid setting 'linearize_tables' as that + # plays havoc with the 'old style' table based pages. + # + conversion_options = { 'title' : title, + 'comments' : description, + 'tags' : tags, + 'language' : language, + 'publisher' : publisher, + 'authors' : publisher, + 'smarten_punctuation' : True + } + ''' + + conversion_options = { 'smarten_punctuation' : True } + + # Specify extra CSS - overrides ALL other CSS (IE. Added last). 
+ extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \ + .introduction, .first { font-weight: bold; } \ + .cross-head { font-weight: bold; font-size: 125%; } \ + .cap, .caption { display: block; font-size: 80%; font-style: italic; } \ + .cap, .caption, .caption img, .caption span { display: block; text-align: center; margin: 5px auto; } \ + .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \ + .correspondent-portrait img, .byline-lead-in, .name, .role, .bbc-role { display: block; \ + text-align: center; font-size: 80%; font-style: italic; margin: 1px auto; } \ + .story-date, .published, .datestamp { font-size: 80%; } \ + table { width: 100%; } \ + td img { display: block; margin: 5px auto; } \ + ul { padding-top: 10px; } \ + ol { padding-top: 10px; } \ + li { padding-top: 5px; padding-bottom: 5px; } \ + h1 { text-align: center; font-size: 175%; font-weight: bold; } \ + h2 { text-align: center; font-size: 150%; font-weight: bold; } \ + h3 { text-align: center; font-size: 125%; font-weight: bold; } \ + h4, h5, h6 { text-align: center; font-size: 100%; font-weight: bold; }' + + # Remove various tag attributes to improve the look of the ebook pages. + remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan', + 'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ] + + # Remove the (admittedly rarely used) line breaks, "<br />", which sometimes + # cause a section of the ebook to start in an unsightly fashion or, more + # frequently, a "<br />" will muck up the formatting of a correspondent's byline. + # "
" and "
" are far more frequently used on the table formatted + # style of pages, and really spoil the look of the ebook pages. + preprocess_regexps = [(re.compile(r'', re.IGNORECASE), lambda m: ''), + (re.compile(r'', re.IGNORECASE), lambda m: '')] + + + # Create regular expressions for tag keeping and removal to make the matches more + # robust against minor changes and errors in the HTML, Eg. double spaces, leading + # and trailing spaces, missing hyphens, and such like. + # Python regular expression ('re' class) page: http://docs.python.org/library/re.html + + # *************************************** + # Regular expressions for keep_only_tags: + # *************************************** + + # The BBC News HTML pages use variants of 'storybody' to denote the section of a HTML + # page which contains the main text of the article. Match storybody variants: 'storybody', + # 'story-body', 'story body','storybody ', etc. + storybody_reg_exp = '^.*story[_ -]*body.*$' + + # The BBC sport and 'newsbeat' (features) HTML pages use 'blq_content' to hold the title + # and published date. This is one level above the usual news pages which have the title + # and date within 'story-body'. This is annoying since 'blq_content' must also be kept, + # resulting in a lot of extra things to be removed by remove_tags. + blq_content_reg_exp = '^.*blq[_ -]*content.*$' + + # The BBC has an alternative page design structure, which I suspect is an out-of-date + # design but which is still used in some articles, Eg. 'Click' (technology), 'FastTrack' + # (travel), and in some sport pages. These alternative pages are table based (which is + # why I think they are an out-of-date design) and account for -I'm guesstimaking- less + # than 1% of all articles. They use a table class 'storycontent' to hold the article + # and like blq_content (above) have required lots of extra removal by remove_tags. 
+ story_content_reg_exp = '^.*story[_ -]*content.*$' + + # Keep the sections of the HTML which match the list below. The HTML page created by + # Calibre will fill with those sections which are matched. Note that the + # blq_content_reg_exp must be listed before storybody_reg_exp in keep_only_tags due to + # it being the parent of storybody_reg_exp, that is to say the div class/id 'story-body' + # will be inside div class/id 'blq_content' in the HTML (if 'blq_content' is there at + # all). If they are the other way around in keep_only_tags then blq_content_reg_exp + # will end up being discarded. + keep_only_tags = [ dict(name='table', attrs={'class':re.compile(story_content_reg_exp, re.IGNORECASE)}), + dict(name='div', attrs={'class':re.compile(blq_content_reg_exp, re.IGNORECASE)}), + dict(name='div', attrs={'id':re.compile(blq_content_reg_exp, re.IGNORECASE)}), + dict(name='div', attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)}), + dict(name='div', attrs={'id':re.compile(storybody_reg_exp, re.IGNORECASE)}) ] + + # ************************************ + # Regular expressions for remove_tags: + # ************************************ + + # Regular expression to remove share-help and variant tags. The share-help class + # is used by the site for a variety of 'sharing' type links, Eg. Facebook, delicious, + # twitter, email. Removed to avoid page clutter. + share_help_reg_exp = '^.*share[_ -]*help.*$' + + # Regular expression to remove embedded-hyper and variant tags. This class is used to + # display links to other BBC News articles on the same/similar subject. + embedded_hyper_reg_exp = '^.*embed*ed[_ -]*hyper.*$' + + # Regular expression to remove hypertabs and variant tags. This class is used to + # display a tab bar at the top of an article which allows the user to switch to + # an article (viewed on the same page) providing further info., 'in depth' analysis, + # an editorial, a correspondant's blog entry, and such like. 
The ability to handle + # a tab bar of this nature is currently beyond the scope of this recipe and + # possibly of Calibre itself (not sure about that - TO DO - check!). + hypertabs_reg_exp = '^.*hyper[_ -]*tabs.*$' + + # Regular expression to remove story-feature and variant tags. Eg. 'story-feature', + # 'story-feature related narrow', 'story-feature wide', 'story-feature narrow'. + # This class is used to add additional info. boxes, or small lists, outside of + # the main story. TO DO: Work out a way to incorporate these neatly. + story_feature_reg_exp = '^.*story[_ -]*feature.*$' + + # Regular expression to remove video and variant tags, Eg. 'videoInStoryB', + # 'videoInStoryC'. This class is used to embed video. + video_reg_exp = '^.*video.*$' + + # Regular expression to remove audio and variant tags, Eg. 'audioInStoryD'. + # This class is used to embed audio. + audio_reg_exp = '^.*audio.*$' + + # Regular expression to remove pictureGallery and variant tags, Eg. 'pictureGallery'. + # This class is used to embed a photo slideshow. See also 'slideshow' below. + picture_gallery_reg_exp = '^.*picture.*$' + + # Regular expression to remove slideshow and variant tags, Eg. 'dslideshow-enclosure'. + # This class is used to embed a slideshow (not necessarily photo) but both + # 'slideshow' and 'pictureGallery' are used for slideshows. + slideshow_reg_exp = '^.*slide[_ -]*show.*$' + + # Regular expression to remove social-links and variant tags. This class is used to + # display links to a BBC bloggers main page, used in various columnist's blogs + # (Eg. Nick Robinson, Robert Preston). + social_links_reg_exp = '^.*social[_ -]*links.*$' + + # Regular expression to remove quote and (multi) variant tags, Eg. 'quote', + # 'endquote', 'quote-credit', 'quote-credit-title', etc. These are usually + # removed by 'story-feature' removal (as they are usually within them), but + # not always. 
The quotation removed is always (AFAICT) in the article text + # as well but a 2nd copy is placed in a quote tag to draw attention to it. + # The quote class tags may or may not appear in div's. + quote_reg_exp = '^.*quote.*$' + + # Regular expression to remove hidden and variant tags, Eg. 'hidden'. + # The purpose of these is unclear, they seem to be an internal link to a + # section within the article, but the text of the link (Eg. 'Continue reading + # the main story') never seems to be displayed anyway. Removed to avoid clutter. + # The hidden class tags may or may not appear in div's. + hidden_reg_exp = '^.*hidden.*$' + + # Regular expression to remove comment and variant tags, Eg. 'comment-introduction'. + # Used on the site to display text about registered users entering comments. + comment_reg_exp = '^.*comment.*$' + + # Regular expression to remove form and variant tags, Eg. 'comment-form'. + # Used on the site to allow registered BBC users to fill in forms, typically + # for entering comments about an article. + form_reg_exp = '^.*form.*$' + + # Extra things to remove due to the addition of 'blq_content' in keep_only_tags. + + #
Used on sports pages for 'email' and 'print'. + story_actions_reg_exp = '^.*story[_ -]*actions.*$' + + #
Used on sports pages instead of 'share-help' (for + # social networking links). + bookmark_list_reg_exp = '^.*bookmark[_ -]*list.*$' + + #
+ # NOTE: Don't remove class="content-group" that is needed. + # Used on sports pages to link to 'similar stories'. + secondary_content_reg_exp = '^.*secondary[_ -]*content.*$' + + #