diff --git a/Changelog.yaml b/Changelog.yaml index 1ec61fa0b5..d8a785dd43 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -4,6 +4,223 @@ # for important features/bug fixes. # Also, each release can have new and improved recipes. +- version: 0.6.42 + date: 2010-02-20 + + bug fixes: + - title: "Fix regression that broke catalog generation from the Graphical User Interface in 0.6.41" + + - title: "Fix right edge of comics like Dilbert and xkcd getting cut off on the SONY reader. More generally, take page margins into account when rescaling images to fit in the selected output profile." + + +- version: 0.6.41 + date: 2010-02-19 + + new features: + - title: "Make calibre timezone aware. This required lots of internal changes, so I may have broken something" + type: major + + - title: "Allow editing of metadata in DRMed MOBI files" + type: major + + - title: "ebook-convert: Allow passing URLs as argument to --cover" + tickets: [4909] + + - title: "OS X/linux driver for EB511" + + - title: "ebook-meta: Allow changing of published date" + + - title: "Make replacing of files in ZIP archives faster and (hopefully) more robust" + + - title: "Speed optimization for viewing large EPUB files" + + - title: "Speed up parsing of OPF files" + tickets: [4908] + + bug fixes: + - title: "Fix drag and drop of multiple books to OS X dock icon" + tickets: [4849] + + - title: "MOBI Output: Encode titles as UTF-8 in the PalmDoc header as well as the EXTH header, since there are apparently MOBI readers that use the title from the PalmDoc header in preference to the title from the EXTH header." + + - title: "MOBI Output: Remove soft hyphens as the Kindle doesn't support them." 
+ tickets: [4887] + + - title: "Fix Boox main mem and SD card swapped on windows" + + - title: "Fix sending large ebook files to devices" + tickets: [4896] + + - title: "EPUB Output: Strip invalid anchors from NCX TOC as Adobe Digital Editions cries when it sees one" + tickets: [4907] + + - title: "EPUB metadata: Don't set title_sort as a file_as attribute, as the brain-dead OPF spec doesn't allow this" + + - title: "Make publishing the content server via mDNS a little more robust" + + - title: "Content server: Use new exact matching for greater precision when generating OPDS catalogs. Also fix regression that broke browsing by Tags on Stanza." + + - title: "Proper fix for breakage in LRF viewer caused by API change in QGraphicsItem in Qt 4.6" + + new recipes: + - title: Various Polish news sources + author: Tomasz Dlugosz + + - title: Que Leer, Wired UK + author: Darko Miletic + + - title: Kathimerini and Ta Nea + author: Pan + + - title: Winter Olympics + author: Starson17 + + improved recipes: + - Wired Magazine + +- version: 0.6.40 + date: 2010-02-12 + + new features: + - title: "Ability to perform exact match and regular expression based searches." + type: major + tickets: [4830] + description: > + "You can now perform exact match searches by prefixing your search term with an =. + So for example, tag:=fiction will match all tags named fiction, but not tags named + non-fiction. Similarly, you can use regular expression based searches by prefixing + the search term by ~." 
+ + - title: "Autodetect if a zip/rar file is actually a comic and if so, import it as CBZ/CBR" + tickets: [4753] + + - title: "Add plugin to automatically extract an ebook during import if it is in a zip/rar archive" + + - title: "Linux source install: Install a calibre environment module to ease the integration of calibre into other python projects" + + bug fixes: + - title: "Fix regression in 0.6.39 that broke the LRF viewer" + + - title: "ZIP/EPUB files: Try to detect file name encoding instead of assuming the name is encoded in UTF-8. Also correctly + encode the extracted file name in the local filesystem encoding." + + - title: "HTML Input: Handle HTML fragments more gracefully" + tickets: [4854] + + - title: "Zip files: Workaround invalid zip files that contain end-of-file comments but set comment size to zero" + + - title: "Restore the recipe for the Wired daily feed." + tickets: [4871] + + - title: "MOBI metadata: Preserve original EXTH records when not overwritten by calibre metadata." + + - title: "Catalog generation: Improved series sorting. All books not in a series are now grouped together" + + - title: "Fix occasional threading related crash when using the ChooseFormatDialog" + + - title: "Catalog generation: Various fixes for handling invalid data" + + new recipes: + - title: Sueddeutsche Zeitung + author: Darko Miletic + + improved recipes: + - Pagina 12 + - Variety + - Toronto Sun + - Telegraph UK + - Danas + - Dilbert + +- version: 0.6.39 + date: 2010-02-09 + + new features: + - title: "Add ability to control how author sort strings are automatically generated from author strings, via the config file tweaks.py" + + - title: "Handle broken EPUB files from Project Gutenberg that have invalid OCF containers" + tickets: [4832] + + bug fixes: + - title: "Fix regression in 0.6.38 that broke setting bookmarks in the viewer" + + - title: "HTML Input: Ignore filenames that are encoded incorrectly." 
+ + new recipes: + + - title: Radikal + author: Darko Miletic + + +- version: 0.6.38 + date: 2010-02-09 + + new features: + - title: "Driver for the Irex DR 800" + + - title: "Driver for the Booq e-book reader" + + - title: "Allow automatic series increment algorithm to be tweaked by editing the config file tweaks.py" + + - title: "Various improvements to the catalog generation. Larger thumbnails in EPUB output and better series sorting. Better handling of html markup in the comments." + + - title: "MOBI Output: Make font used for generated masthead images user customizable." + + bug fixes: + - title: "E-book viewer: Make bookmarking (and remembering last open position) more robust. For linuxsource installs, you must have Qt 4.6" + tickets: [4812] + + - title: "Fix conversion/import of HTML files with very long href links on windows" + tickets: [4783] + + - title: "Don't read metadata from filenames for downloaded news, even if the user has the read metadata from filename option set" + tickets: [4758] + + - title: "Don't allow leading or trailing space in tags and series. Also normalize all internal spaces to a single space" + tickets: [4809] + + - title: "E-book viewer: Toolbars remember their position" + tickets: [4811] + + - title: "Fix year being repeated when editing date in main library screen on windows" + tickets: [4829] + + - title: "News download: Fix downloading of images from URLs with an ampersand in them" + + - title: "Linux source install: unbundle cssutils, it is now an external dependency" + + - title: "MOBI metadata: Fix regression that broke setting of titles in some MOBI files" + + - title: "EPUB metadata: Extract the cover image from the html it is embedded in if possible, instead of rendering the html. 
Removes the white margins on covers and speeds up cover extraction" + + - title: "Fix regression in PDB output" + + - title: "News download: Remove tags automatically" + + - title: "Searching on device: Ignore unicode errors" + + + new recipes: + - title: Courier Press + author: Krittika Goyal + + - title: zive.sk and iliterature.cz + author: Abelturd + + - title: El Comercio, Digital Spy UK, Gizmodo, News Straits Times, Read It Later, TidBits + author: Darko Miletic + + improved recipes: + - Jerusalem Post + - Clarin + - La Nacion + - Harvard Business Review + - People US Mashup + - The New Republic + - "Pagina 12" + - Discover Magazine + - Metro Montreal + - version: 0.6.37 date: 2010-02-01 diff --git a/resources/catalog/stylesheet.css b/resources/catalog/stylesheet.css index 12063b121f..06bbf8eaab 100644 --- a/resources/catalog/stylesheet.css +++ b/resources/catalog/stylesheet.css @@ -79,3 +79,9 @@ p.unread_book { text-indent:-2em; } +hr.series_divider { + width:50%; + margin-left:1em; + margin-top:0em; + margin-bottom:0em; + } diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py new file mode 100644 index 0000000000..77cfaaedf5 --- /dev/null +++ b/resources/default_tweaks.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +__license__ = 'GPL v3' +__copyright__ = '2010, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +''' +Contains various tweaks that affect calibre behavior. Only edit this file if +you know what you are doing. If you delete this file, it will be recreated from +defaults. +''' + + +# The algorithm used to assign a new book in an existing series a series number. 
+# Possible values are: +# next - Next available number +# const - Assign the number 1 always +series_index_auto_increment = 'next' + + + +# The algorithm used to copy author to author_sort +# Possible values are: +# invert: use "fn ln" -> "ln, fn" (the original algorithm) +# copy : copy author to author_sort without modification +# comma : use 'copy' if there is a ',' in the name, otherwise use 'invert' +author_sort_copy_method = 'invert' diff --git a/resources/images/catalog.svg b/resources/images/catalog.svg new file mode 100644 index 0000000000..724e0cf8e5 --- /dev/null +++ b/resources/images/catalog.svg @@ -0,0 +1,157 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/resources/images/library.png b/resources/images/library.png index e093247162..721ef0546d 100644 Binary files a/resources/images/library.png and b/resources/images/library.png differ diff --git a/resources/images/news/radikal_tr.png b/resources/images/news/radikal_tr.png new file mode 100644 index 0000000000..60932df762 Binary files /dev/null and b/resources/images/news/radikal_tr.png differ diff --git a/resources/images/news/sueddeutschezeitung.png b/resources/images/news/sueddeutschezeitung.png new file mode 100644 index 0000000000..f6ed36cd91 Binary files /dev/null and b/resources/images/news/sueddeutschezeitung.png differ diff --git a/resources/images/news/wired_uk.png b/resources/images/news/wired_uk.png new file mode 100644 index 0000000000..c807e36d1f Binary files /dev/null and b/resources/images/news/wired_uk.png differ diff --git a/resources/kathemerini.recipe b/resources/kathemerini.recipe new file mode 100644 index 0000000000..b68a35d0a8 --- /dev/null +++ b/resources/kathemerini.recipe @@ -0,0 +1,37 @@ +from calibre.web.feeds.recipes import BasicNewsRecipe + +class Kathimerini(BasicNewsRecipe): + title = 
'Kathimerini' + __author__ = 'Pan' + description = 'News from Greece' + max_articles_per_feed = 100 + oldest_article = 100 + publisher = 'Kathimerini' + category = 'news, GR' + language = 'el' + no_stylesheets = True + remove_tags_before = dict(name='td',attrs={'class':'news'}) + remove_tags_after = dict(name='td',attrs={'class':'news'}) + remove_attributes = ['width', 'src','header','footer'] + + feeds = [(u'\u03a0\u03bf\u03bb\u03b9\u03c4\u03b9\u03ba\u03ae', + 'http://wk.kathimerini.gr/xml_files/politics.xml'), + (u'\u0395\u03bb\u03bb\u03ac\u03b4\u03b1', + ' http://wk.kathimerini.gr/xml_files/ell.xml'), + (u'\u039a\u03cc\u03c3\u03bc\u03bf\u03c2', + ' http://wk.kathimerini.gr/xml_files/world.xml'), + (u'\u039f\u03b9\u03ba\u03bf\u03bd\u03bf\u03bc\u03af\u03b1', + 'http://wk.kathimerini.gr/xml_files/economy_1.xml'), + (u'\u0395\u03c0\u03b9\u03c7\u03b5\u03b9\u03c1\u03ae\u03c3\u03b5\u03b9\u03c2', + 'http://wk.kathimerini.gr/xml_files/economy_2.xml'), + (u'\u0394\u03b9\u03b5\u03b8\u03bd\u03ae\u03c2 \u039f\u03b9\u03ba\u03bf\u03bd\u03bf\u03bc\u03af\u03b1', + 'http://wk.kathimerini.gr/xml_files/economy_3.xml'), + (u'\u03a0\u03bf\u03bb\u03b9\u03c4\u03b9\u03c3\u03bc\u03cc\u03c2', + 'http://wk.kathimerini.gr/xml_files/civ.xml'), + (u'\u039c\u03cc\u03bd\u03b9\u03bc\u03b5\u03c2 \u03a3\u03c4\u03ae\u03bb\u03b5\u03c2', + 'http://wk.kathimerini.gr/xml_files/st.xml')] + + def print_version(self, url): + return url.replace('http://news.kathimerini.gr/4dcgi/', 'http://news.kathimerini.gr/4dcgi/4dcgi/') + + diff --git a/resources/recipes/clarin.recipe b/resources/recipes/clarin.recipe index 7c0bf7b7ef..3a96bca162 100644 --- a/resources/recipes/clarin.recipe +++ b/resources/recipes/clarin.recipe @@ -1,6 +1,6 @@ __license__ = 'GPL v3' -__copyright__ = '2008-2009, Darko Miletic ' +__copyright__ = '2008-2010, Darko Miletic ' ''' clarin.com ''' @@ -21,7 +21,8 @@ class Clarin(BasicNewsRecipe): cover_url = strftime('http://www.clarin.com/diario/%Y/%m/%d/portada.jpg') encoding = 'cp1252' 
language = 'es' - extra_css = ' .Txt{ font-family: sans-serif } .Volan{ font-family: sans-serif; font-size: x-small} .Pie{ font-family: sans-serif; font-size: x-small} .Copete{font-family: sans-serif; font-size: large} .Hora{font-family: sans-serif; font-size: large} .Autor{font-family: sans-serif; font-size: small} ' + masthead_url = 'http://www.clarin.com/shared/v10/img/Hd/lg_Clarin.gif' + extra_css = ' body{font-family: Arial,Helvetica,sans-serif} h2{font-family: Georgia,"Times New Roman",Times,serif; font-size: xx-large} .Volan,.Pie,.Autor{ font-size: x-small} .Copete,.Hora{font-size: large} ' conversion_options = { 'comment' : description diff --git a/resources/recipes/courrier.recipe b/resources/recipes/courrier.recipe new file mode 100644 index 0000000000..d5559a5fca --- /dev/null +++ b/resources/recipes/courrier.recipe @@ -0,0 +1,26 @@ +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup + +class CourierPress(BasicNewsRecipe): + title = u'Courier Press' + language = 'en' + __author__ = 'Krittika Goyal' + oldest_article = 1 #days + max_articles_per_feed = 25 + + remove_stylesheets = True + remove_tags = [ + dict(name='iframe'), + ] + + feeds = [ +('Courier Press', + 'http://www.courierpress.com/rss/headlines/news/'), +] + + def preprocess_html(self, soup): + story = soup.find(name='div', attrs={'id':'article_body'}) + soup = BeautifulSoup('t') + body = soup.find(name='body') + body.insert(0, story) + return soup diff --git a/resources/recipes/danas.recipe b/resources/recipes/danas.recipe index 4de308a57d..081c46a5d2 100644 --- a/resources/recipes/danas.recipe +++ b/resources/recipes/danas.recipe @@ -1,64 +1,63 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2008-2009, Darko Miletic ' +__copyright__ = '2008-2010, Darko Miletic ' ''' danas.rs ''' + import re from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag class Danas(BasicNewsRecipe): 
title = 'Danas' __author__ = 'Darko Miletic' - description = 'Vesti' + description = 'Dnevne novine sa vestima iz sveta, politike, ekonomije, kulture, sporta, Beograda, Novog Sada i cele Srbije.' publisher = 'Danas d.o.o.' category = 'news, politics, Serbia' oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = False use_embedded_content = False + encoding = 'utf-8' + masthead_url = 'http://www.danas.rs/images/basic/danas.gif' language = 'sr' - lang = 'sr-Latn-RS' - direction = 'ltr' - extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}' + extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} .article_description,body,.lokacija{font-family: Tahoma,Arial,Helvetica,sans1,sans-serif} .nadNaslov,h1,.preamble{font-family: Georgia,"Times New Roman",Times,serif1,serif} .antrfileText{border-left: 2px solid #999999; color:#666666; margin-left: 0.8em; padding-left: 1.2em; margin-bottom: 0; margin-top: 0} h2,.datum,.lokacija,.autor{font-size: small} .antrfileNaslov{border-left: 2px solid #999999; color:#666666; margin-left: 0.8em; padding-left: 1.2em; font-weight:bold; margin-bottom: 0; margin-top: 0} img{margin-bottom: 0.8em} ' conversion_options = { 'comment' : description , 'tags' : category , 'publisher' : publisher , 'language' : language - , 'pretty_print' : True } - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] keep_only_tags = [dict(name='div', attrs={'id':'left'})] remove_tags = [ dict(name='div', attrs={'class':['width_1_4','metaClanka','baner']}) ,dict(name='div', attrs={'id':'comments'}) - ,dict(name=['object','link']) + ,dict(name=['object','link','iframe']) ] - feeds = [ - (u'Vesti' , 
u'http://www.danas.rs/rss/rss.asp' ) - ,(u'Periskop', u'http://www.danas.rs/rss/rss.asp?column_id=4') + feeds = [ + (u'Politika' , u'http://www.danas.rs/rss/rss.asp?column_id=27') + ,(u'Hronika' , u'http://www.danas.rs/rss/rss.asp?column_id=2' ) + ,(u'Drustvo' , u'http://www.danas.rs/rss/rss.asp?column_id=24') + ,(u'Dijalog' , u'http://www.danas.rs/rss/rss.asp?column_id=1' ) + ,(u'Ekonomija', u'http://www.danas.rs/rss/rss.asp?column_id=6' ) + ,(u'Svet' , u'http://www.danas.rs/rss/rss.asp?column_id=25') + ,(u'Srbija' , u'http://www.danas.rs/rss/rss.asp?column_id=28') + ,(u'Kultura' , u'http://www.danas.rs/rss/rss.asp?column_id=5' ) + ,(u'Sport' , u'http://www.danas.rs/rss/rss.asp?column_id=13') + ,(u'Scena' , u'http://www.danas.rs/rss/rss.asp?column_id=42') + ,(u'Feljton' , u'http://www.danas.rs/rss/rss.asp?column_id=19') + ,(u'Periskop' , u'http://www.danas.rs/rss/rss.asp?column_id=4' ) ] def preprocess_html(self, soup): - mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) - soup.head.insert(0,mlang) - attribs = [ 'style','font','valign' - ,'colspan','width','height' - ,'rowspan','summary','align' - ,'cellspacing','cellpadding' - ,'frames','rules','border' - ] - for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']): - item.name = 'div' - for attrib in attribs: - if item.has_key(attrib): - del item[attrib] + for item in soup.findAll(style=True): + del item['style'] return soup + + def print_version(self, url): + return url + '&action=print' + diff --git a/resources/recipes/di.recipe b/resources/recipes/di.recipe new file mode 100644 index 0000000000..3b3f6e5c7c --- /dev/null +++ b/resources/recipes/di.recipe @@ -0,0 +1,60 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__author__ = 'Mori' +__version__ = 'v. 
0.5' +''' +di.com.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class DziennikInternautowRecipe(BasicNewsRecipe): + __author__ = 'Mori' + language = 'pl' + + title = u'Dziennik Internautow' + publisher = u'Dziennik Internaut\xc3\xb3w Sp. z o.o.' + description =u'Internet w \xc5\xbcyciu i biznesie. Porady, wywiady, interwencje, bezpiecze\xc5\x84stwo w Sieci, technologia.' + + max_articles_per_feed = 100 + oldest_article = 7 + cover_url = 'http://di.com.pl/pic/logo_di_norm.gif' + + no_stylesheets = True + remove_javascript = True + encoding = 'utf-8' + + extra_css = ''' + .fotodesc{font-size: 75%;} + .pub_data{font-size: 75%;} + .fotonews{clear: both; padding-top: 10px; padding-bottom: 10px;} + #pub_foto{font-size: 75%; float: left; padding-right: 10px;} + ''' + + feeds = [ + (u'Dziennik Internautów', u'http://feeds.feedburner.com/glowny-di') + ] + + keep_only_tags = [ + dict(name = 'div', attrs = {'id' : 'pub_head'}), + dict(name = 'div', attrs = {'id' : 'pub_content'}) + ] + + remove_tags = [ + dict(name = 'div', attrs = {'class' : 'poradniki_context'}), + dict(name = 'div', attrs = {'class' : 'uniBox'}), + dict(name = 'object', attrs = {}), + dict(name = 'h3', attrs = {}) + ] + + preprocess_regexps = [ + (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in + [ + (r', ', lambda match: ''), + (r'http://di.com.pl/pic/photo/mini/', lambda match: 'http://di.com.pl/pic/photo/oryginal/'), + (r'\s*', lambda match: ''), + ] + ] diff --git a/resources/recipes/dilbert.recipe b/resources/recipes/dilbert.recipe index ddca52b40a..82966b1d15 100644 --- a/resources/recipes/dilbert.recipe +++ b/resources/recipes/dilbert.recipe @@ -3,6 +3,7 @@ __copyright__ = '2009, Darko Miletic ' ''' http://www.dilbert.com ''' +import re from calibre.web.feeds.recipes import BasicNewsRecipe @@ -28,6 +29,12 @@ class DosisDiarias(BasicNewsRecipe): feeds = [(u'Dilbert', u'http://feeds.dilbert.com/DilbertDailyStrip' )] + preprocess_regexps = [ + 
(re.compile('strip\..*\.gif', re.DOTALL|re.IGNORECASE), + lambda match: 'strip.zoom.gif') + ] + + def get_article_url(self, article): return article.get('feedburner_origlink', None) diff --git a/resources/recipes/eclicto.recipe b/resources/recipes/eclicto.recipe new file mode 100644 index 0000000000..a1c625b21c --- /dev/null +++ b/resources/recipes/eclicto.recipe @@ -0,0 +1,49 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__author__ = 'Mori' +__version__ = 'v. 0.1' +''' +blog.eclicto.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class BlogeClictoRecipe(BasicNewsRecipe): + __author__ = 'Mori' + language = 'pl' + + title = u'Blog eClicto' + publisher = u'Blog eClicto' + description = u'Blog o e-papierze i e-bookach' + + max_articles_per_feed = 100 + cover_url = 'http://blog.eclicto.pl/wordpress/wp-content/themes/blog_eclicto/g/logo.gif' + + no_stylesheets = True + remove_javascript = True + encoding = 'utf-8' + + extra_css = ''' + img{float: left; padding-right: 10px; padding-bottom: 5px;} + ''' + + feeds = [ + (u'Blog eClicto', u'http://blog.eclicto.pl/feed/') + ] + + remove_tags = [ + dict(name = 'span', attrs = {'id' : 'tags'}) + ] + + remove_tags_after = [ + dict(name = 'div', attrs = {'class' : 'post'}) + ] + + preprocess_regexps = [ + (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in + [ + (r'\s*', lambda match: ''), + ] + ] diff --git a/resources/recipes/eksiazki.recipe b/resources/recipes/eksiazki.recipe new file mode 100644 index 0000000000..248cead215 --- /dev/null +++ b/resources/recipes/eksiazki.recipe @@ -0,0 +1,26 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = u'2010, Tomasz Dlugosz ' +''' +eksiazki.org +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class eksiazki(BasicNewsRecipe): + + title = u'eKsiazki.org' + desciption = u'Twoje centrum wiedzy o ePapierze i eBookach' + language = 'pl' + __author__ = u'Tomasz D\u0142ugosz' + no_stylesheets = True + remove_javascript = 
True + + feeds = [(u'eKsiazki.org', u'http://www.eksiazki.org/feed/')] + + keep_only_tags = [dict(name='div', attrs={'id':'content-body'})] + remove_tags = [ + dict(name='span', attrs={'class':'nr_comm'}), + dict(name='div', attrs={'id':'tabsContainer'}), + dict(name='div', attrs={'class':'next_previous_links'})] diff --git a/resources/recipes/houston_chronicle.recipe b/resources/recipes/houston_chronicle.recipe index 77e35dfc0c..3ec1abbf0f 100644 --- a/resources/recipes/houston_chronicle.recipe +++ b/resources/recipes/houston_chronicle.recipe @@ -1,17 +1,41 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai from calibre.web.feeds.news import BasicNewsRecipe class HoustonChronicle(BasicNewsRecipe): title = u'The Houston Chronicle' description = 'News from Houston, Texas' - __author__ = 'Kovid Goyal' + __author__ = 'Kovid Goyal and Sujata Raman' language = 'en' timefmt = ' [%a, %d %b, %Y]' no_stylesheets = True - keep_only_tags = [dict(id=['story-head', 'story'])] - remove_tags = [dict(id=['share-module', 'resource-box', - 'resource-box-header'])] + keep_only_tags = [ + dict(id=['story-head', 'story']) + ] + + remove_tags = [ + dict(id=['share-module', 'resource-box', + 'resource-box-header']) + ] + + extra_css = ''' + h1{font-family :Arial,Helvetica,sans-serif; font-size:large;} + h2{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#666666;} + h3{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#000000;} + h4{font-family :Arial,Helvetica,sans-serif; font-size: x-small;} + p{font-family :Arial,Helvetica,sans-serif; font-size:x-small;} + #story-head h1{font-family :Arial,Helvetica,sans-serif; font-size: xx-large;} + #story-head h2{font-family :Arial,Helvetica,sans-serif; font-size: small; color:#000000;} + #story-head h3{font-family :Arial,Helvetica,sans-serif; font-size: xx-small;} + #story-head h4{font-family :Arial,Helvetica,sans-serif; font-size: xx-small;} + #story{font-family 
:Arial,Helvetica,sans-serif; font-size:xx-small;} + #Text-TextSubhed BoldCond PoynterAgateZero h3{color:#444444;font-family :Arial,Helvetica,sans-serif; font-size:small;} + .p260x p{font-family :Arial,Helvetica,serif; font-size:x-small;font-style:italic;} + .p260x h6{color:#777777;font-family :Arial,Helvetica,sans-serif; font-size:xx-small;} + ''' + def parse_index(self): soup = self.index_to_soup('http://www.chron.com/news/') @@ -64,3 +88,6 @@ class HoustonChronicle(BasicNewsRecipe): feeds.append((current_section, current_articles)) return feeds + + + diff --git a/resources/recipes/interia_fakty.recipe b/resources/recipes/interia_fakty.recipe new file mode 100644 index 0000000000..cdd245fdd7 --- /dev/null +++ b/resources/recipes/interia_fakty.recipe @@ -0,0 +1,38 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = u'2010, Tomasz Dlugosz ' +''' +fakty.interia.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class InteriaFakty(BasicNewsRecipe): + title = u'Interia.pl - Fakty' + language = 'pl' + oldest_article = 7 + __author__ = u'Tomasz D\u0142ugosz' + simultaneous_downloads = 2 + no_stylesheets = True + remove_javascript = True + max_articles_per_feed = 100 + + feeds = [(u'Kraj', u'http://kanaly.rss.interia.pl/kraj.xml'), + (u'\u015awiat', u'http://kanaly.rss.interia.pl/swiat.xml'), + (u'Wiadomo\u015bci dnia', u'http://kanaly.rss.interia.pl/fakty.xml'), + (u'Przegl\u0105d prasy', u'http://kanaly.rss.interia.pl/przeglad_prasy.xml'), + (u'Wywiady', u'http://kanaly.rss.interia.pl/wywiady.xml'), + (u'Ciekawostki', u'http://kanaly.rss.interia.pl/ciekawostki.xml')] + + keep_only_tags = [dict(name='div', attrs={'id':'article'})] + + remove_tags = [ + dict(name='div', attrs={'class':'box fontSizeSwitch'}), + dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'embed embedLeft articleEmbedArticleList articleEmbedArticleListTitle'}), + dict(name='span', attrs={'class':'keywords'})] + + extra_css = ''' + h2 { 
font-size: 1.2em; } + ''' diff --git a/resources/recipes/interia_sport.recipe b/resources/recipes/interia_sport.recipe new file mode 100644 index 0000000000..9c72ee28a7 --- /dev/null +++ b/resources/recipes/interia_sport.recipe @@ -0,0 +1,71 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = u'2010, Tomasz Dlugosz ' +''' +sport.interia.pl +''' + +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class InteriaSport(BasicNewsRecipe): + title = u'Interia.pl - Sport' + language = 'pl' + oldest_article = 7 + __author__ = u'Tomasz D\u0142ugosz' + simultaneous_downloads = 3 + no_stylesheets = True + remove_javascript = True + max_articles_per_feed = 100 + + feeds = [(u'Wydarzenia sportowe', u'http://kanaly.rss.interia.pl/sport.xml'), + (u'Pi\u0142ka no\u017cna', u'http://kanaly.rss.interia.pl/pilka_nozna.xml'), + (u'Siatk\xf3wka', u'http://kanaly.rss.interia.pl/siatkowka.xml'), + (u'Koszyk\xf3wka', u'http://kanaly.rss.interia.pl/koszykowka.xml'), + (u'NBA', u'http://kanaly.rss.interia.pl/nba.xml'), + (u'Kolarstwo', u'http://kanaly.rss.interia.pl/kolarstwo.xml'), + (u'\u017bu\u017cel', u'http://kanaly.rss.interia.pl/zuzel.xml'), + (u'Tenis', u'http://kanaly.rss.interia.pl/tenis.xml')] + + keep_only_tags = [dict(name='div', attrs={'id':'article'})] + + remove_tags = [dict(name='div', attrs={'class':'object gallery'})] + + extra_css = ''' + .articleDate { + font-size: 0.5em; + color: black; + } + + .articleFoto { + display: block; + font-family: sans; + font-size: 0.5em; + text-indent: 0 + color: black; + } + + .articleText { + display: block; + margin-bottom: 1em; + margin-left: 0; + margin-right: 0; + margin-top: 1em + color: black; + } + + .articleLead { + font-size: 1.2em; + } + ''' + + preprocess_regexps = [ + (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in + [ + (r'', lambda match: ''), + # FIXME + #(r'()(.*?)()(.*?)()', lambda match: '\1\2\4'), + (r'()?(ZOBACZ|CZYTAJ) T.*?', lambda match: '') + ] + ] diff --git 
a/resources/recipes/jpost.recipe b/resources/recipes/jpost.recipe index b4b7d19c3c..8f1cdf73f4 100644 --- a/resources/recipes/jpost.recipe +++ b/resources/recipes/jpost.recipe @@ -10,22 +10,19 @@ class JerusalemPost(BasicNewsRecipe): __author__ = 'Kovid Goyal' max_articles_per_feed = 10 no_stylesheets = True - remove_tags_before = {'class':'byline'} - remove_tags = [ - {'class':['artAdBlock clearboth', 'tbartop', 'divdot_vrttbox', - 'slideshow']}, - dict(id=['artFontButtons', 'artRelatedBlock']), - ] - remove_tags_after = {'id':'artTxtBlock'} - + remove_tags_before = {'class':'jp-grid-content'} + remove_tags_after = {'id':'body_val'} + feeds = [ ('Front Page', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333346'), ('Israel News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463156'), ('Middle East News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333498'), ('International News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463144'), ('Editorials', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333211'), ] - - def postprocess_html(self, soup, first): - for tag in soup.findAll(name=['table', 'tr', 'td']): - tag.name = 'div' - return soup \ No newline at end of file + + def preprocess_html(self, soup): + for x in soup.findAll(name=['form', 'input']): + x.name = 'div' + for x in soup.findAll('body', style=True): + del x['style'] + return soup diff --git a/resources/recipes/lanacion.recipe b/resources/recipes/lanacion.recipe index 298c980f00..000b4fb0f6 100644 --- a/resources/recipes/lanacion.recipe +++ b/resources/recipes/lanacion.recipe @@ -1,7 +1,5 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2008-2009, Darko Miletic ' +__copyright__ = '2008-2010, Darko Miletic ' ''' lanacion.com.ar ''' @@ -12,28 +10,34 @@ class Lanacion(BasicNewsRecipe): title = 'La Nacion' __author__ = 'Darko Miletic' 
description = 'Noticias de Argentina y el resto del mundo' - publisher = 'La Nacion' + publisher = 'La Nacion S.A.' category = 'news, politics, Argentina' oldest_article = 2 max_articles_per_feed = 100 use_embedded_content = False - remove_javascript = True no_stylesheets = True + language = 'es' + encoding = 'cp1252' + masthead_url = 'http://www.lanacion.com.ar/imgs/layout/logos/ln341x47.gif' + extra_css = ' h1{font-family: Georgia,serif} body{font-family: Arial,sans-serif} img{margin-top: 0.5em; margin-bottom: 0.2em} .notaEpigrafe{font-size: x-small} ' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - ] - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher': publisher + , 'language' : language + } keep_only_tags = [dict(name='div', attrs={'class':'nota floatFix'})] remove_tags = [ dict(name='div' , attrs={'class':'notaComentario floatFix noprint' }) ,dict(name='ul' , attrs={'class':'cajaHerramientas cajaTop noprint'}) ,dict(name='div' , attrs={'class':'cajaHerramientas noprint' }) + ,dict(attrs={'class':['titulosMultimedia','derecha','techo color']}) + ,dict(name=['iframe','embed','object']) ] + remove_attributes = ['height','width'] feeds = [ (u'Ultimas noticias' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?origen=2' ) @@ -51,10 +55,4 @@ class Lanacion(BasicNewsRecipe): ] def preprocess_html(self, soup): - mtag = '' - soup.head.insert(0,mtag) - for item in soup.findAll(style=True): - del item['style'] - return soup - - language = 'es' + return self.adeify_images(soup) diff --git a/resources/recipes/legeartis.recipe b/resources/recipes/legeartis.recipe new file mode 100644 index 0000000000..7d5de45035 --- /dev/null +++ b/resources/recipes/legeartis.recipe @@ -0,0 +1,43 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__author__ = 'Mori' 
+__version__ = 'v. 0.1' +''' +olgierd.bblog.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class LegeArtisRecipe(BasicNewsRecipe): + __author__ = 'Mori' + language = 'pl' + + title = u'Lege Artis' + publisher = u'Olgierd Rudak' + description = u'Wszystko, co chcieliby\xc5\x9bcie wiedzie\xc4\x87 o prawie, ale wstydzicie si\xc4\x99 zapyta\xc4\x87' + + max_articles_per_feed = 100 + + no_stylesheets = True + remove_javascript = True + + extra_css = ''' + img{clear: both;} + ''' + + feeds = [ + (u'Lege Artis', u'http://olgierd.bblog.pl/rss/rss20.xml') + ] + + keep_only_tags = [ + dict(name = 'div', attrs = {'class' : 'post_title'}), + dict(name = 'div', attrs = {'class' : 'post_date'}), + dict(name = 'div', attrs = {'class' : 'post_content'}) + ] + + remove_tags = [ + dict(name = 'div', attrs = {'id' : 'bb_tools'}), + dict(name = 'div', attrs = {'class' : 'post_comments'}), + dict(name = 'object', attrs = {}) + ] diff --git a/resources/recipes/legitymizm.recipe b/resources/recipes/legitymizm.recipe new file mode 100644 index 0000000000..468f1b1d6b --- /dev/null +++ b/resources/recipes/legitymizm.recipe @@ -0,0 +1,49 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = u'2010, Tomasz Dlugosz ' +''' +legitymizm.org +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class Legitymizm(BasicNewsRecipe): + title = u'Organizacja Monarchist\xf3w Polskich' + language = 'pl' + oldest_article = 7 + __author__ = u'Tomasz D\u0142ugosz' + max_articles_per_feed = 100 + cover_url = 'http://www.legitymizm.org/img_omp/logo.gif' + no_stylesheets = True + + feeds = [(u'Aktualno\u015bci i publicystyka', u'http://www.legitymizm.org/rss.php')] + + keep_only_tags = [dict(name='div', attrs={'id':'szeroka_kolumna'})] + remove_tags = [dict(name = 'div', attrs = {'class' : 'koniec_tresci_wlasciwej'}), + dict(name = 'div', attrs = {'class' : 'return'})] + + extra_css = ''' + body { font-family: Georgia, 'Times New Roman', Times, serif; } + h1 { color: 
#898981; font-weight: normal; font-size: 26px; letter-spacing: -1px; line-height: 23px; text-align: left; } + h2, h3 { font-weight: normal; font-size: 20px; line-height: 23px; letter-spacing: -1px; margin: 0 0 3px 0; text-align: left; } + #szeroka_kolumna { float: left; line-height: 20px; } + #szeroka_kolumna ul.wykaz { list-style-type: none; margin: 0 0 1.2em 0; padding: 0; } + #szeroka_kolumna ul.wykaz li.wykaz_2 { font-weight: bold; margin: 0.6em 0 0 0; } + #szeroka_kolumna ul.wykaz a { text-decoration: none; } + #szeroka_kolumna ul.wykaz li.wykaz_1, #szeroka_kolumna ul.wykaz li.wykaz_2 ul li { list-style-type: square; color: #898981; text-transform: none; font-weight: normal; padding: 0; } + #szeroka_kolumna ul.wykaz li.wykaz_1 { margin: 0 0 0 1.3em; } + #szeroka_kolumna ul.wykaz li.wykaz_2 ul { margin: 0; padding: 0 0 0 1.3em; } + #szeroka_kolumna h3.autor { background-color: #898981; color: #f9f9f8; margin: -25px 0px 30px 0; text-align: left; padding: 0 0 0 2px; } + .tresc_wlasciwa { border-top: 1px solid #898981; padding: 30px 0px 0px 0px; position: relative; } + #cytat { font-size: 11px; line-height: 19px; font-style: italic; text-align: justify; } + #cytat img { width: 100px; height: 105px; float: right; margin: 3px 0 0 10px; } + .duzy_cytat { padding: 20px 20px 10px 20px; margin: 0 0 1.2em 0; } + #szeroka_kolumna img, #szeroka_kolumna object { padding: 3px; border: 1px solid #898981; } + #szeroka_kolumna img.ilustracja { margin: 0px 10px 0 0; float: left; } + p { margin: 0 0 1.2em 0; } + #cytat p.sentencja { margin: 0; } + #cytat p.sentencja:first-letter { font-size: 44px; line-height: 33px; margin: 0 2px 0 0; font-style: normal; float: left; display: block; } + p.autor { text-transform: uppercase; color: #898981; font-style: normal; text-align: left; } + ''' + diff --git a/resources/recipes/michalkiewicz.recipe b/resources/recipes/michalkiewicz.recipe new file mode 100644 index 0000000000..ba58ba432b --- /dev/null +++ 
b/resources/recipes/michalkiewicz.recipe @@ -0,0 +1,26 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Tomasz Dlugosz ' +''' +michalkiewicz.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +# + +class michalkiewicz(BasicNewsRecipe): + title = u'Stanis\u0142aw Michalkiewicz' + description = u'Strona autorska * felietony * artyku\u0142y * komentarze' + __author__ = u'Tomasz D\u0142ugosz' + language = 'pl' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + + keep_only_tags = [dict(name='div', attrs={'class':'modul_srodek'})] + remove_tags = [dict(name='ul', attrs={'class':'menu'})] + + feeds = [(u'Teksty', u'http://www.michalkiewicz.pl/rss.xml')] + diff --git a/resources/recipes/nczas.recipe b/resources/recipes/nczas.recipe new file mode 100644 index 0000000000..ccf963e43a --- /dev/null +++ b/resources/recipes/nczas.recipe @@ -0,0 +1,35 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Tomasz Dlugosz ' +''' +nczas.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +# + +class NCzas(BasicNewsRecipe): + title = u'Najwy\u017cszy Czas!' 
+ description = u'Najwy\u017cszy Czas!\nwydanie internetowe' + __author__ = u'Tomasz D\u0142ugosz' + language = 'pl' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + cover_url = 'http://nczas.com/wp-content/themes/default/grafika/logo.png' + + keep_only_tags = [dict(name='div', attrs={'class':'trescartykulu'})] + + feeds = [(u'Najwy\u017cszy Czas!', u'http://nczas.com/feed/')] + + def postprocess_html(self, soup, first): + + for tag in soup.findAll(name= 'img', alt=""): + tag.extract() + + for item in soup.findAll(align = "right"): + del item['align'] + + return soup diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe index e3942469a4..d389ca4eea 100644 --- a/resources/recipes/nytimes_sub.recipe +++ b/resources/recipes/nytimes_sub.recipe @@ -37,7 +37,7 @@ class NYTimes(BasicNewsRecipe): dict(name=['script', 'noscript', 'style'])] encoding = decode no_stylesheets = True - extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}' + extra_css = 'h1 {font-family:sans-serif; font-size:2em; font-weight:bold;}\n.byline {font:monospace;}\n.bold {font-weight:bold;}' def get_browser(self): br = BasicNewsRecipe.get_browser() diff --git a/resources/recipes/pagina12.recipe b/resources/recipes/pagina12.recipe index a5ee18a7ed..da16c1697b 100644 --- a/resources/recipes/pagina12.recipe +++ b/resources/recipes/pagina12.recipe @@ -15,14 +15,14 @@ class Pagina12(BasicNewsRecipe): publisher = 'La Pagina S.A.' 
category = 'news, politics, Argentina' oldest_article = 2 - max_articles_per_feed = 100 + max_articles_per_feed = 200 no_stylesheets = True encoding = 'cp1252' use_embedded_content = False language = 'es' remove_empty_feeds = True masthead_url = 'http://www.pagina12.com.ar/commons/imgs/logo-home.gif' - extra_css = ' body{font-family: Arial,Helvetica,sans-serif } h2{color: #028CCD} img{margin-bottom: 0.4em} .epigrafe{font-size: x-small; background-color: #EBEAE5; color: #565144 } .intro{font-size: 1.1em} ' + extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} #autor{font-weight: bold} #fecha,#epigrafe{font-size: 0.9em; margin: 5px} #imagen{border: 1px solid black; margin: 0 0 1.25em 1.25em; width: 232px } ' conversion_options = { 'comment' : description @@ -45,7 +45,9 @@ class Pagina12(BasicNewsRecipe): ,(u'NO' , u'http://www.pagina12.com.ar/diario/rss/no.xml' ) ,(u'Las/12' , u'http://www.pagina12.com.ar/diario/rss/las12.xml' ) ,(u'Soy' , u'http://www.pagina12.com.ar/diario/rss/soy.xml' ) - ,(u'M2' , u'http://www.pagina12.com.ar/diario/rss/futuro.xml' ) + ,(u'Futuro' , u'http://www.pagina12.com.ar/diario/rss/futuro.xml' ) + ,(u'M2' , u'http://www.pagina12.com.ar/diario/rss/m2.xml' ) + ,(u'Rosario/12' , u'http://www.pagina12.com.ar/diario/rss/rosario.xml' ) ] def print_version(self, url): @@ -60,3 +62,7 @@ class Pagina12(BasicNewsRecipe): return image['src'] return None + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup \ No newline at end of file diff --git a/resources/recipes/queleer.recipe b/resources/recipes/queleer.recipe new file mode 100644 index 0000000000..bef0bd8395 --- /dev/null +++ b/resources/recipes/queleer.recipe @@ -0,0 +1,56 @@ +__license__ = 'GPL v3' +__copyright__ = '2010, Darko Miletic ' +''' +www.que-leer.com +''' + +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class QueLeer(BasicNewsRecipe): + title = 'Que Leer' + __author__ = 'Darko 
Miletic' + description = 'Libros, Novedades en libros, Criticas, Noticias libro' + publisher = 'MC Ediciones, S.A.' + category = 'news, books, criticas, libros' + oldest_article = 7 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf-8' + use_embedded_content = False + language = 'es' + remove_empty_feeds = True + masthead_url = 'http://www.que-leer.com/wp-content/themes/queleer/images/backgrounds/que-leer.jpg' + extra_css = ' body{font-family: Arial,sans-serif } img{margin-bottom: 0.4em} ' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + preprocess_regexps = [(re.compile(r'.*?
()?(ZOBACZ|CZYTAJ) T.*?