From ba2b5056b017cdd55b780ccd7354b57830514f70 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 12 Feb 2010 09:50:48 -0700 Subject: [PATCH 1/3] Fix #4871 (Wired magazine seems broken) --- resources/recipes/wired_daily.recipe | 44 ++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 resources/recipes/wired_daily.recipe diff --git a/resources/recipes/wired_daily.recipe b/resources/recipes/wired_daily.recipe new file mode 100644 index 0000000000..f06d28796e --- /dev/null +++ b/resources/recipes/wired_daily.recipe @@ -0,0 +1,44 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__docformat__ = 'restructuredtext en' + + +from calibre.web.feeds.news import BasicNewsRecipe + +class Wired_Daily(BasicNewsRecipe): + + title = 'Wired Daily Edition' + __author__ = 'Kovid Goyal' + description = 'Technology news' + timefmt = ' [%Y%b%d %H%M]' + language = 'en' + + no_stylesheets = True + + remove_tags_before = dict(name='div', id='content') + remove_tags = [dict(id=['social_tools', 'outerWrapper', 'sidebar', + 'footer', 'advertisement', 'blog_subscription_unit', + 'brightcove_component']), + {'class':'entryActions'}, + dict(name=['noscript', 'script'])] + + feeds = [ + ('Top News', 'http://feeds.wired.com/wired/index'), + ('Culture', 'http://feeds.wired.com/wired/culture'), + ('Software', 'http://feeds.wired.com/wired/software'), + ('Mac', 'http://feeds.feedburner.com/cultofmac/bFow'), + ('Gadgets', 'http://feeds.wired.com/wired/gadgets'), + ('Cars', 'http://feeds.wired.com/wired/cars'), + ('Entertainment', 'http://feeds.wired.com/wired/entertainment'), + ('Gaming', 'http://feeds.wired.com/wired/gaming'), + ('Science', 'http://feeds.wired.com/wired/science'), + ('Med Tech', 'http://feeds.wired.com/wired/medtech'), + ('Politics', 'http://feeds.wired.com/wired/politics'), + ('Tech Biz', 'http://feeds.wired.com/wired/techbiz'), + ('Commentary', 'http://feeds.wired.com/wired/commentary'), + ] + + def print_version(self, url): + return url.replace('http://www.wired.com/', 'http://www.wired.com/print/') + + From a3052cf127092fe658d04f042e982500565d4f9b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 12 Feb 2010 09:51:58 -0700 Subject: [PATCH 2/3] Fix #4876 (Updated recipes) --- resources/recipes/pagina12.recipe | 12 +++- resources/recipes/variety.recipe | 92 +++++++++++++++---------------- 2 files changed, 55 insertions(+), 49 deletions(-) diff --git a/resources/recipes/pagina12.recipe b/resources/recipes/pagina12.recipe index a5ee18a7ed..da16c1697b 100644 --- a/resources/recipes/pagina12.recipe +++ b/resources/recipes/pagina12.recipe @@ -15,14 +15,14 @@ class Pagina12(BasicNewsRecipe): publisher = 'La Pagina S.A.' category = 'news, politics, Argentina' oldest_article = 2 - max_articles_per_feed = 100 + max_articles_per_feed = 200 no_stylesheets = True encoding = 'cp1252' use_embedded_content = False language = 'es' remove_empty_feeds = True masthead_url = 'http://www.pagina12.com.ar/commons/imgs/logo-home.gif' - extra_css = ' body{font-family: Arial,Helvetica,sans-serif } h2{color: #028CCD} img{margin-bottom: 0.4em} .epigrafe{font-size: x-small; background-color: #EBEAE5; color: #565144 } .intro{font-size: 1.1em} ' + extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} #autor{font-weight: bold} #fecha,#epigrafe{font-size: 0.9em; margin: 5px} #imagen{border: 1px solid black; margin: 0 0 1.25em 1.25em; width: 232px } ' conversion_options = { 'comment' : description @@ -45,7 +45,9 @@ class Pagina12(BasicNewsRecipe): ,(u'NO' , u'http://www.pagina12.com.ar/diario/rss/no.xml' ) ,(u'Las/12' , u'http://www.pagina12.com.ar/diario/rss/las12.xml' ) ,(u'Soy' , u'http://www.pagina12.com.ar/diario/rss/soy.xml' ) - ,(u'M2' , u'http://www.pagina12.com.ar/diario/rss/futuro.xml' ) + ,(u'Futuro' , u'http://www.pagina12.com.ar/diario/rss/futuro.xml' ) + ,(u'M2' , u'http://www.pagina12.com.ar/diario/rss/m2.xml' ) + ,(u'Rosario/12' , u'http://www.pagina12.com.ar/diario/rss/rosario.xml' ) ] def print_version(self, url): @@ -60,3 +62,7 @@ class Pagina12(BasicNewsRecipe): return image['src'] return None + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup \ No newline at end of file diff --git a/resources/recipes/variety.recipe b/resources/recipes/variety.recipe index 7321e0ad33..9f0c445e6b 100644 --- a/resources/recipes/variety.recipe +++ b/resources/recipes/variety.recipe @@ -1,46 +1,46 @@ -#!/usr/bin/env python - -__license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' -''' -www.variety.com -''' - -from calibre.web.feeds.recipes import BasicNewsRecipe - -class Variety(BasicNewsRecipe): - title = 'Variety' - __author__ = 'Darko Miletic' - description = 'Breaking entertainment movie news, movie reviews, entertainment industry events, news and reviews from Cannes, Oscars, and Hollywood awards. Featuring box office charts, archives and more.' - oldest_article = 2 - max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - encoding = 'cp1252' - publisher = 'Red Business Information' - category = 'Entertainment Industry News, Daily Variety, Movie Reviews, TV, Awards, Oscars, Cannes, Box Office, Hollywood' - language = 'en' - - conversion_options = { - 'comments' : description - ,'tags' : category - ,'language' : language - ,'publisher' : publisher - } - - remove_tags = [dict(name=['object','link','map'])] - - keep_only_tags = [dict(name='div', attrs={'id':'article'})] - - feeds = [(u'News & Articles', u'http://feeds.feedburner.com/variety/headlines' )] - - def print_version(self, url): - rpt = url.rpartition('?')[0] - artid = rpt.rpartition('/')[2] - catidr = url.rpartition('categoryid=')[2] - catid = catidr.partition('&')[0] - return 'http://www.variety.com/index.asp?layout=print_story&articleid=' + artid + '&categoryid=' + catid - - def get_article_url(self, article): - return article.get('feedburner_origlink', None) - +__license__ = 'GPL v3' +__copyright__ = '2009-2010, Darko Miletic ' +''' +www.variety.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + +class Variety(BasicNewsRecipe): + title = 'Variety' + __author__ = 'Darko Miletic' + description = 'Breaking entertainment movie news, movie reviews, entertainment industry events, news and reviews from Cannes, Oscars, and Hollywood awards. Featuring box office charts, archives and more.' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + encoding = 'cp1252' + publisher = 'Red Business Information' + category = 'Entertainment Industry News, Daily Variety, Movie Reviews, TV, Awards, Oscars, Cannes, Box Office, Hollywood' + language = 'en' + masthead_url = 'http://a330.g.akamai.net/7/330/23382/20090528190853/www.variety.com/graphics/variety/Variety_logo_green_tm.gif' + extra_css = ' body{font-family: Georgia,"Times New Roman",Times,Courier,serif } img{margin-bottom: 1em} ' + + conversion_options = { + 'comments' : description + ,'tags' : category + ,'language' : language + ,'publisher' : publisher + } + + remove_tags = [dict(name=['object','link','map'])] + + keep_only_tags = [dict(name='div', attrs={'id':'article'})] + + feeds = [(u'News & Articles', u'http://feeds.feedburner.com/variety/headlines' )] + + def print_version(self, url): + rpt = url.rpartition('?')[0] + artid = rpt.rpartition('/')[2] + catidr = url.rpartition('categoryid=')[2] + catid = catidr.partition('&')[0] + return 'http://www.variety.com/index.asp?layout=print_story&articleid=' + artid + '&categoryid=' + catid + + + def preprocess_html(self, soup): + return self.adeify_images(soup) From a7b5f60f6fd8a6f97f26319575f409ca487cdd67 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 12 Feb 2010 10:27:45 -0700 Subject: [PATCH 3/3] Autodetect if a zip/rar file is actually a comic. Fixes #4880 (RAR/ZIP file Autodetection to treat them as CBR/CBZ) --- src/calibre/ebooks/metadata/archive.py | 19 +++++++++++++++---- src/calibre/ebooks/metadata/rar.py | 10 +++++++--- src/calibre/ebooks/metadata/zip.py | 10 ++++++++-- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/metadata/archive.py b/src/calibre/ebooks/metadata/archive.py index 6b71f41a88..f9e78e5dfa 100644 --- a/src/calibre/ebooks/metadata/archive.py +++ b/src/calibre/ebooks/metadata/archive.py @@ -6,17 +6,21 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import textwrap, os +import os from contextlib import closing from calibre.customize import FileTypePlugin +def is_comic(list_of_names): + extensions = set([x.rpartition('.')[-1].lower() for x in list_of_names]) + return len(extensions) == 1 and iter(extensions).next() in ('jpg', 'jpeg', 'png') + class ArchiveExtract(FileTypePlugin): name = 'Archive Extract' author = 'Kovid Goyal' - description = textwrap.dedent(_('''\ - Extract common e-book formats from archives (zip/rar) files. - ''')) + description = _('Extract common e-book formats from archives ' + '(zip/rar) files. Also try to autodetect if they are actually ' + 'cbz/cbr files.') file_types = set(['zip', 'rar']) supported_platforms = ['windows', 'osx', 'linux'] on_import = True @@ -35,6 +39,13 @@ class ArchiveExtract(FileTypePlugin): fnames = zf.namelist() fnames = [x for x in fnames if '.' in x] + if is_comic(fnames): + ext = '.cbr' if is_rar else '.cbz' + of = self.temporary_file('_archive_extract'+ext) + with open(archive, 'rb') as f: + of.write(f.read()) + of.close() + return of.name if len(fnames) > 1 or not fnames: return archive fname = fnames[0] diff --git a/src/calibre/ebooks/metadata/rar.py b/src/calibre/ebooks/metadata/rar.py index 16f2c67af7..896e3d7777 100644 --- a/src/calibre/ebooks/metadata/rar.py +++ b/src/calibre/ebooks/metadata/rar.py @@ -13,6 +13,9 @@ from calibre.ptempfile import PersistentTemporaryFile from calibre.libunrar import extract_member, names def get_metadata(stream): + from calibre.ebooks.metadata.archive import is_comic + from calibre.ebooks.metadata.meta import get_metadata + path = getattr(stream, 'name', False) if not path: pt = PersistentTemporaryFile('_rar-meta.rar') @@ -21,6 +24,8 @@ def get_metadata(stream): path = pt.name path = os.path.abspath(path) file_names = list(names(path)) + if is_comic(file_names): + return get_metadata(stream, 'cbr') for f in file_names: stream_type = os.path.splitext(f)[1].lower() if stream_type: @@ -29,8 +34,7 @@ def get_metadata(stream): 'rb', 'imp', 'pdf', 'lrf'): data = extract_member(path, match=None, name=f)[1] stream = StringIO(data) - from calibre.ebooks.metadata.meta import get_metadata return get_metadata(stream, stream_type) - raise ValueError('No ebook found in RAR archive') - + raise ValueError('No ebook found in RAR archive') + diff --git a/src/calibre/ebooks/metadata/zip.py b/src/calibre/ebooks/metadata/zip.py index 624e0fe73c..db9d751f3a 100644 --- a/src/calibre/ebooks/metadata/zip.py +++ b/src/calibre/ebooks/metadata/zip.py @@ -8,15 +8,21 @@ from cStringIO import StringIO def get_metadata(stream): + from calibre.ebooks.metadata.meta import get_metadata + from calibre.ebooks.metadata.archive import is_comic stream_type = None zf = ZipFile(stream, 'r') - for f in zf.namelist(): + names = zf.namelist() + if is_comic(names): + # Is probably a comic + return get_metadata(stream, 'cbz') + + for f in names: stream_type = os.path.splitext(f)[1].lower() if stream_type: stream_type = stream_type[1:] if stream_type in ('lit', 'opf', 'prc', 'mobi', 'fb2', 'epub', 'rb', 'imp', 'pdf', 'lrf'): - from calibre.ebooks.metadata.meta import get_metadata stream = StringIO(zf.read(f)) return get_metadata(stream, stream_type) raise ValueError('No ebook found in ZIP archive')