From 9138e0bae5ec2af77a3e0b96b8ace4a64503a6a1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 30 Oct 2010 06:27:17 -0600 Subject: [PATCH 1/9] Fix #7335 (elperiodico (Catalan and Spanish) updated recipes) --- resources/recipes/elperiodico_catalan.recipe | 28 ++++++++++++------ resources/recipes/elperiodico_spanish.recipe | 30 +++++++++++++------- 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/resources/recipes/elperiodico_catalan.recipe b/resources/recipes/elperiodico_catalan.recipe index e2bcb738b7..6b78f923cb 100644 --- a/resources/recipes/elperiodico_catalan.recipe +++ b/resources/recipes/elperiodico_catalan.recipe @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- __license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' +__copyright__ = '30 October 2010, Jordi Balcells based on an earlier recipe by Darko Miletic ' ''' elperiodico.cat ''' @@ -12,8 +12,8 @@ from calibre.ebooks.BeautifulSoup import Tag class ElPeriodico_cat(BasicNewsRecipe): title = 'El Periodico de Catalunya' - __author__ = 'Darko Miletic' - description = 'Noticias desde Catalunya' + __author__ = 'Jordi Balcells/Darko Miletic' + description = 'Noticies des de Catalunya' publisher = 'elperiodico.cat' category = 'news, politics, Spain, Catalunya' oldest_article = 2 @@ -33,15 +33,25 @@ class ElPeriodico_cat(BasicNewsRecipe): html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' - feeds = [(u"Tota l'edició", u'http://www.elperiodico.cat/rss.asp?id=46')] + feeds = [(u'Portada', u'http://www.elperiodico.cat/ca/rss/rss_portada.xml'), + (u'Internacional', u'http://www.elperiodico.cat/ca/rss/internacional/rss.xml'), + (u'Societat', u'http://www.elperiodico.cat/ca/rss/societat/rss.xml'), + (u'Ci\xe8ncia i tecnologia', u'http://www.elperiodico.cat/ca/rss/ciencia-i-tecnologia/rss.xml'), + (u'Esports', u'http://www.elperiodico.cat/ca/rss/esports/rss.xml'), + (u'Gent', u'http://www.elperiodico.cat/ca/rss/gent/rss.xml'), + (u'Opini\xf3', 
u'http://www.elperiodico.cat/ca/rss/opinio/rss.xml'), + (u'Pol\xedtica', u'http://www.elperiodico.cat/ca/rss/politica/rss.xml'), + (u'Barcelona', u'http://www.elperiodico.cat/ca/rss/barcelona/rss.xml'), + (u'Economia', u'http://www.elperiodico.cat/ca/rss/economia/rss.xml'), + (u'Cultura i espectacles', u'http://www.elperiodico.cat/ca/rss/cultura-i-espectacles/rss.xml'), + (u'Tele', u'http://www.elperiodico.cat/ca/rss/tele/rss.xml')] - keep_only_tags = [dict(name='div', attrs={'id':'noticia'})] + keep_only_tags = [dict(name='div', attrs={'class':'titularnoticia'}), + dict(name='div', attrs={'class':'noticia_completa'})] - remove_tags = [ - dict(name=['object','link','script']) - ,dict(name='ul',attrs={'class':'herramientasDeNoticia'}) - ,dict(name='div', attrs={'id':'inferiores'}) + remove_tags = [dict(name='div', attrs={'class':['opcionb','opcionb last','columna_noticia']}), + dict(name='span', attrs={'class':'opcionesnoticia'}) ] def print_version(self, url): diff --git a/resources/recipes/elperiodico_spanish.recipe b/resources/recipes/elperiodico_spanish.recipe index 073863fa15..d19adc5e58 100644 --- a/resources/recipes/elperiodico_spanish.recipe +++ b/resources/recipes/elperiodico_spanish.recipe @@ -2,17 +2,17 @@ # -*- coding: utf-8 -*- __license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' +__copyright__ = '30 October 2010, Jordi Balcells based on an earlier recipe by Darko Miletic ' ''' -elperiodico.com +elperiodico.cat ''' from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import Tag -class ElPeriodico_esp(BasicNewsRecipe): +class ElPeriodico_cat(BasicNewsRecipe): title = 'El Periodico de Catalunya' - __author__ = 'Darko Miletic' + __author__ = 'Jordi Balcells/Darko Miletic' description = 'Noticias desde Catalunya' publisher = 'elperiodico.com' category = 'news, politics, Spain, Catalunya' @@ -33,15 +33,25 @@ class ElPeriodico_esp(BasicNewsRecipe): html2epub_options = 'publisher="' + publisher + '"\ncomments="' + 
description + '"\ntags="' + category + '"' - feeds = [(u"Toda la edición", u'http://www.elperiodico.com/rss.asp?id=46')] + feeds = [(u'Portada', u'http://www.elperiodico.com/es/rss/rss_portada.xml'), + (u'Internacional', u'http://elperiodico.com/es/rss/internacional/rss.xml'), + (u'Sociedad', u'http://elperiodico.com/es/rss/sociedad/rss.xml'), + (u'Ciencia y Tecnolog\xeda', u'http://elperiodico.com/es/rss/ciencia-y-tecnologia/rss.xml'), + (u'Deportes', u'http://elperiodico.com/es/rss/deportes/rss.xml'), + (u'Gente', u'http://elperiodico.com/es/rss/gente/rss.xml'), + (u'Opini\xf3n', u'http://elperiodico.com/es/rss/opinion/rss.xml'), + (u'Pol\xedtica', u'http://elperiodico.com/es/rss/politica/rss.xml'), + (u'Barcelona', u'http://elperiodico.com/es/rss/barcelona/rss.xml'), + (u'Econom\xeda', u'http://elperiodico.com/es/rss/economia/rss.xml'), + (u'Cultura y espect\xe1culos', u'http://elperiodico.com/es/rss/cultura-y-espectaculos/rss.xml'), + (u'Tele', u'http://elperiodico.com/es/rss/cultura-y-espectaculos/rss.xml')] - keep_only_tags = [dict(name='div', attrs={'id':'noticia'})] + keep_only_tags = [dict(name='div', attrs={'class':'titularnoticia'}), + dict(name='div', attrs={'class':'noticia_completa'})] - remove_tags = [ - dict(name=['object','link','script']) - ,dict(name='ul',attrs={'class':'herramientasDeNoticia'}) - ,dict(name='div', attrs={'id':'inferiores'}) + remove_tags = [dict(name='div', attrs={'class':['opcionb','opcionb last','columna_noticia']}), + dict(name='span', attrs={'class':'opcionesnoticia'}) ] def print_version(self, url): From 4555cee0830b1dc1727ebaa9888b45891dc6d0b3 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Sat, 30 Oct 2010 17:14:12 +0100 Subject: [PATCH 2/9] Add filename wildcard matching to ignore_names in check_library.py --- src/calibre/gui2/dialogs/check_library.py | 4 ++++ src/calibre/library/check_library.py | 12 +++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git 
a/src/calibre/gui2/dialogs/check_library.py b/src/calibre/gui2/dialogs/check_library.py index 1cd11e7807..55cd91dcd3 100644 --- a/src/calibre/gui2/dialogs/check_library.py +++ b/src/calibre/gui2/dialogs/check_library.py @@ -55,12 +55,16 @@ class CheckLibraryDialog(QDialog): h.addWidget(ln) self.name_ignores = QLineEdit() self.name_ignores.setText(db.prefs.get('check_library_ignore_names', '')) + self.name_ignores.setToolTip( + _('Enter comma-separated standard file name wildcards, such as synctoy*.dat')) ln.setBuddy(self.name_ignores) h.addWidget(self.name_ignores) le = QLabel(_('Extensions to ignore')) h.addWidget(le) self.ext_ignores = QLineEdit() self.ext_ignores.setText(db.prefs.get('check_library_ignore_extensions', '')) + self.ext_ignores.setToolTip( + _('Enter comma-separated extensions without a leading dot. Used only in book folders')) le.setBuddy(self.ext_ignores) h.addWidget(self.ext_ignores) self._layout.addLayout(h) diff --git a/src/calibre/library/check_library.py b/src/calibre/library/check_library.py index 85f3d4747c..b285da0006 100644 --- a/src/calibre/library/check_library.py +++ b/src/calibre/library/check_library.py @@ -5,7 +5,7 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import re, os, traceback +import re, os, traceback, fnmatch from calibre import isbytestring from calibre.constants import filesystem_encoding @@ -66,13 +66,19 @@ class CheckLibrary(object): return self.failed_folders or self.mismatched_dirs or \ self.conflicting_custom_cols or self.failed_restores + def ignore_name(self, filename): + for filespec in self.ignore_names: + if fnmatch.fnmatch(filename, filespec): + return True + return False; + def scan_library(self, name_ignores, extension_ignores): self.ignore_names = frozenset(name_ignores) self.ignore_ext = frozenset(['.'+ e for e in extension_ignores]) lib = self.src_library_path for auth_dir in os.listdir(lib): - if auth_dir in self.ignore_names or auth_dir == 
'metadata.db': + if self.ignore_name(auth_dir) or auth_dir == 'metadata.db': continue auth_path = os.path.join(lib, auth_dir) # First check: author must be a directory @@ -85,7 +91,7 @@ class CheckLibrary(object): # Look for titles in the author directories found_titles = False for title_dir in os.listdir(auth_path): - if title_dir in self.ignore_names: + if self.ignore_name(title_dir): continue title_path = os.path.join(auth_path, title_dir) db_path = os.path.join(auth_dir, title_dir) From e9b419521e7f49e5289d93b5a5809b5dc7e83145 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 30 Oct 2010 10:35:28 -0600 Subject: [PATCH 3/9] Fix regression that caused original files to be deleted when adding books recursively. Also Fix #7342 (Version 0.7.25 not get metadata from file name) --- src/calibre/ebooks/html/input.py | 11 +++++++++-- src/calibre/ebooks/metadata/worker.py | 28 ++++++++++++++++++++------- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 603adadb53..fa1de39410 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -282,15 +282,22 @@ class HTMLInput(InputFormatPlugin): basedir = os.getcwd() self.opts = opts + fname = None if hasattr(stream, 'name'): basedir = os.path.dirname(stream.name) + fname = os.path.basename(stream.name) if file_ext != 'opf': if opts.dont_package: raise ValueError('The --dont-package option is not supported for an HTML input file') from calibre.ebooks.metadata.html import get_metadata - oeb = self.create_oebbook(stream.name, basedir, opts, log, - get_metadata(stream)) + mi = get_metadata(stream) + if fname: + from calibre.ebooks.metadata.meta import metadata_from_filename + fmi = metadata_from_filename(fname) + fmi.smart_update(mi) + mi = fmi + oeb = self.create_oebbook(stream.name, basedir, opts, log, mi) return oeb from calibre.ebooks.conversion.plumber import create_oebbook diff --git 
a/src/calibre/ebooks/metadata/worker.py b/src/calibre/ebooks/metadata/worker.py index d2616c2444..247050856d 100644 --- a/src/calibre/ebooks/metadata/worker.py +++ b/src/calibre/ebooks/metadata/worker.py @@ -12,7 +12,7 @@ import os, time, sys, shutil from calibre.utils.ipc.job import ParallelJob from calibre.utils.ipc.server import Server -from calibre.ptempfile import PersistentTemporaryDirectory +from calibre.ptempfile import PersistentTemporaryDirectory, TemporaryDirectory from calibre import prints from calibre.constants import filesystem_encoding @@ -39,6 +39,10 @@ def serialize_metadata_for(formats, tdir, id_): f.write(cdata) def read_metadata_(task, tdir, notification=lambda x,y:x): + with TemporaryDirectory() as mdir: + do_read_metadata(task, tdir, mdir, notification) + +def do_read_metadata(task, tdir, mdir, notification): from calibre.customize.ui import run_plugins_on_import for x in task: try: @@ -48,17 +52,28 @@ def read_metadata_(task, tdir, notification=lambda x,y:x): try: if isinstance(formats, basestring): formats = [formats] import_map = {} - fmts = [] + fmts, metadata_fmts = [], [] for format in formats: + mfmt = format + name, ext = os.path.splitext(os.path.basename(format)) nfp = run_plugins_on_import(format) - if not nfp or not os.access(nfp, os.R_OK): - nfp = format - nfp = os.path.abspath(nfp) + if not nfp or nfp == format or not os.access(nfp, os.R_OK): + nfp = None + else: + # Ensure that the filename is preserved so that + # reading metadata from filename is not broken + nfp = os.path.abspath(nfp) + nfext = os.path.splitext(nfp)[1] + mfmt = os.path.join(mdir, name + nfext) + shutil.copyfile(nfp, mfmt) + metadata_fmts.append(mfmt) fmts.append(nfp) - serialize_metadata_for(fmts, tdir, id_) + serialize_metadata_for(metadata_fmts, tdir, id_) for format, nfp in zip(formats, fmts): + if not nfp: + continue if isinstance(nfp, unicode): nfp.encode(filesystem_encoding) x = lambda j : os.path.abspath(os.path.normpath(os.path.normcase(j))) @@ -68,7 
+83,6 @@ def read_metadata_(task, tdir, notification=lambda x,y:x): dest = os.path.join(tdir, '%s.%s'%(id_, nfmt)) shutil.copyfile(nfp, dest) import_map[fmt] = dest - os.remove(nfp) if import_map: with open(os.path.join(tdir, str(id_)+'.import'), 'wb') as f: for fmt, nfp in import_map.items(): From 7a319807db3e0b4cd8cfdab7ada57f8901cfdb94 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 30 Oct 2010 10:41:22 -0600 Subject: [PATCH 4/9] Gamespot Reviews by Marc Tonsing. Fixes #7344 (New Recipe: Gamespot.com Reviews) --- resources/recipes/gamespot.recipe | 41 +++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 resources/recipes/gamespot.recipe diff --git a/resources/recipes/gamespot.recipe b/resources/recipes/gamespot.recipe new file mode 100644 index 0000000000..d9a5f20c23 --- /dev/null +++ b/resources/recipes/gamespot.recipe @@ -0,0 +1,41 @@ +__license__ = 'GPL v3' +__author__ = u'Marc T\xf6nsing' + +from calibre.web.feeds.news import BasicNewsRecipe + +class GamespotCom(BasicNewsRecipe): + + title = u'Gamespot.com Reviews' + description = 'review articles from gamespot.com' + language = 'en' + __author__ = u'Marc T\xf6nsing' + + oldest_article = 7 + max_articles_per_feed = 40 + remove_empty_feeds = True + no_stylesheets = True + no_javascript = True + + feeds = [ + ('PC Reviews', 'http://www.gamespot.com/rss/game_updates.php?type=5&platform=5'), + ('XBOX 360 Reviews', 'http://www.gamespot.com/rss/game_updates.php?type=5&platform=1029'), + ('Wii Reviews', 'http://www.gamespot.com/rss/game_updates.php?type=5&platform=1031'), + ('PlayStation 3 Reviews', 'http://www.gamespot.com/rss/game_updates.php?type=5&platform=1028'), + ('PlayStation 2 Reviews', 'http://www.gamespot.com/rss/game_updates.php?type=5&platform=7'), + ('PlayStation Portable Reviews', 'http://www.gamespot.com/rss/game_updates.php?type=5&platform=1024'), + ('Nintendo DS Reviews', 'http://www.gamespot.com/rss/game_updates.php?type=5&platform=1026'), + ('iPhone Reviews', 
'http://www.gamespot.com/rss/game_updates.php?type=5&platform=1049'), + ] + + remove_tags = [ + dict(name='div', attrs={'class':'top_bar'}), + dict(name='div', attrs={'class':'video_embed'}) + ] + + def get_cover_url(self): + return 'http://image.gamespotcdn.net/gamespot/shared/gs5/gslogo_bw.gif' + + def get_article_url(self, article): + return article.get('link') + '?print=1' + + From 66b9c8a9dacfcf8c9894d2aed078c72ee0bce1a7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 30 Oct 2010 10:51:11 -0600 Subject: [PATCH 5/9] Fix #7332 (Edit metadata in bulk window doesn't resize horizontally properly) --- src/calibre/gui2/dialogs/metadata_bulk.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/calibre/gui2/dialogs/metadata_bulk.py b/src/calibre/gui2/dialogs/metadata_bulk.py index 32350c36b7..6b5ef60263 100644 --- a/src/calibre/gui2/dialogs/metadata_bulk.py +++ b/src/calibre/gui2/dialogs/metadata_bulk.py @@ -571,6 +571,10 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog): self.initalize_authors() self.initialize_series() self.initialize_publisher() + for x in ('authors', 'publisher', 'series'): + x = getattr(self, x) + x.setSizeAdjustPolicy(x.AdjustToMinimumContentsLengthWithIcon) + x.setMinimumContentsLength(25) def initalize_authors(self): all_authors = self.db.all_authors() From 4da76331d19fe4a0abeeb41c3819f1141a1ab76e Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Sat, 30 Oct 2010 18:21:08 +0100 Subject: [PATCH 6/9] Put search/replace filter spacer back in, only this time in the scroll area --- src/calibre/gui2/dialogs/metadata_bulk.ui | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/calibre/gui2/dialogs/metadata_bulk.ui b/src/calibre/gui2/dialogs/metadata_bulk.ui index 0fe537b598..62a40a9676 100644 --- a/src/calibre/gui2/dialogs/metadata_bulk.ui +++ b/src/calibre/gui2/dialogs/metadata_bulk.ui @@ -678,6 +678,19 @@ nothing should be put between the original text and the inserted text + + + + Qt::Vertical + + + + 20 
+ 5 + + + + From a18598a7c725b7b8c2e487c4eb58b2b975e49cf4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 30 Oct 2010 11:21:33 -0600 Subject: [PATCH 7/9] ... --- src/calibre/ebooks/metadata/amazon.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py index 10590b8cff..a8ff0f1ad0 100644 --- a/src/calibre/ebooks/metadata/amazon.py +++ b/src/calibre/ebooks/metadata/amazon.py @@ -92,10 +92,14 @@ def get_metadata(br, asin, mi): ' @class="emptyClear" or @href]'): c.getparent().remove(c) desc = html.tostring(desc, method='html', encoding=unicode).strip() - desc = re.sub(r' class=[^>]+>', '>', desc) + # remove all attributes from tags + desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) + # Collapse whitespace desc = re.sub('\n+', '\n', desc) desc = re.sub(' +', ' ', desc) + # Remove the notice about text referring to out of print editions desc = re.sub(r'(?s)--This text ref.*?', '', desc) + # Remove comments desc = re.sub(r'(?s)', '', desc) mi.comments = desc From 1ac83da4014b7b4c88e54b905aeddcac608e9a00 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 30 Oct 2010 11:48:34 -0600 Subject: [PATCH 8/9] Only add SONY periodical code to downloaded news if output profile is set to one of the SONY reader profiles. 
This is needed because the ever delightful Stanza crashes and burns when an EPUB has the periodical code --- src/calibre/customize/profiles.py | 8 ++++++++ src/calibre/ebooks/epub/output.py | 7 ++++--- src/calibre/manual/plugins.rst | 3 ++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index f8fe2a59b4..38f6d401f6 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -259,6 +259,9 @@ class OutputProfile(Plugin): #: Number of ems that the left margin of a blockquote is rendered as mobi_ems_per_blockquote = 1.0 + #: Special periodical formatting needed in EPUB + epub_periodical_format = None + @classmethod def tags_to_string(cls, tags): return escape(', '.join(tags)) @@ -439,6 +442,9 @@ class SonyReaderOutput(OutputProfile): fsizes = [7.5, 9, 10, 12, 15.5, 20, 22, 24] unsupported_unicode_chars = [u'\u201f', u'\u201b'] + epub_periodical_format = 'sony' + #periodical_date_in_title = False + class KoboReaderOutput(OutputProfile): @@ -561,6 +567,8 @@ class CybookOpusOutput(SonyReaderOutput): fbase = 16 fsizes = [12, 12, 14, 16, 18, 20, 22, 24] + epub_periodical_format = None + class KindleOutput(OutputProfile): name = 'Kindle' diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py index 38820010a8..952559a9e2 100644 --- a/src/calibre/ebooks/epub/output.py +++ b/src/calibre/ebooks/epub/output.py @@ -187,9 +187,10 @@ class EPUBOutput(OutputFormatPlugin): metadata_xml = None extra_entries = [] if self.is_periodical: - from calibre.ebooks.epub.periodical import sony_metadata - metadata_xml, atom_xml = sony_metadata(oeb) - extra_entries = [('atom.xml', 'application/atom+xml', atom_xml)] + if self.opts.output_profile.epub_periodical_format == 'sony': + from calibre.ebooks.epub.periodical import sony_metadata + metadata_xml, atom_xml = sony_metadata(oeb) + extra_entries = [('atom.xml', 'application/atom+xml', atom_xml)] 
oeb_output = plugin_for_output_format('oeb') oeb_output.convert(oeb, tdir, input_plugin, opts, log) opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0] diff --git a/src/calibre/manual/plugins.rst b/src/calibre/manual/plugins.rst index 1b9b47ed3d..eb955aebee 100644 --- a/src/calibre/manual/plugins.rst +++ b/src/calibre/manual/plugins.rst @@ -36,6 +36,7 @@ FileTypePlugin .. _pluginsMetadataPlugin: + Metadata plugins ------------------- @@ -50,7 +51,6 @@ Metadata plugins :members: :member-order: bysource -.. _pluginsMetadataSource: Catalog plugins ---------------- @@ -60,6 +60,7 @@ Catalog plugins :members: :member-order: bysource +.. _pluginsMetadataSource: Metadata download plugins -------------------------- From 7d7757ab93fff88bfc4aeddd25da2498adf7146d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 30 Oct 2010 12:15:00 -0600 Subject: [PATCH 9/9] Fix #7210 (Download News) --- resources/recipes/nzherald.recipe | 93 +++++++++++-------------------- src/calibre/web/feeds/news.py | 3 + 2 files changed, 34 insertions(+), 62 deletions(-) diff --git a/resources/recipes/nzherald.recipe b/resources/recipes/nzherald.recipe index 3ac1e27c20..b73fd8366e 100644 --- a/resources/recipes/nzherald.recipe +++ b/resources/recipes/nzherald.recipe @@ -1,74 +1,43 @@ from calibre.web.feeds.recipes import BasicNewsRecipe +import re class NewZealandHerald(BasicNewsRecipe): title = 'New Zealand Herald' - __author__ = 'Krittika Goyal' + __author__ = 'Kovid Goyal' description = 'Daily news' timefmt = ' [%d %b, %Y]' language = 'en_NZ' + oldest_article = 2.5 - no_stylesheets = True - remove_tags_before = dict(name='div', attrs={'class':'contentContainer left eight'}) - remove_tags_after = dict(name='div', attrs={'class':'callToAction'}) - remove_tags = [ - dict(name='iframe'), - dict(name='div', attrs={'class':['sectionHeader', 'tools','callToAction', 'contentContainer right two nopad relatedColumn']}), - #dict(name='div', attrs={'id':['shareContainer']}), - #dict(name='form', 
attrs={'onsubmit':"return verifySearch(this.w,'Keyword, citation, or #author')"}), - #dict(name='table', attrs={'cellspacing':'0'}), + feeds = [ + ('Business', + 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000003.xml'), + ('World', + 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000002.xml'), + ('National', + 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000001.xml'), + ('Entertainment', + 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_001501119.xml'), + ('Travel', + 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000007.xml'), + ('Opinion', + 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000466.xml'), + ('Life & Style', + 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000006.xml'), + ('Technology', + 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000005.xml'), + ('Sport', + 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000004.xml'), + ('Motoring', + 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000009.xml'), + ('Property', + 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000008.xml'), ] - def preprocess_html(self, soup): - table = soup.find('table') - if table is not None: - table.extract() - return soup - - #TO GET ARTICLES IN SECTION - def nz_parse_section(self, url): - soup = self.index_to_soup(url) - div = soup.find(attrs={'class':'col-300 categoryList'}) - date = div.find(attrs={'class':'link-list-heading'}) - - current_articles = [] - for x in date.findAllNext(attrs={'class':['linkList', 'link-list-heading']}): - if x.get('class') == 'link-list-heading': break - for li in x.findAll('li'): - a = li.find('a', href=True) - if a is None: - continue - title = self.tag_to_string(a) - url = a.get('href', False) - if not url or not title: - continue - if url.startswith('/'): - url = 'http://www.nzherald.co.nz'+url - self.log('\t\tFound article:', title) - self.log('\t\t\t', url) - current_articles.append({'title': title, 'url':url, - 'description':'', 'date':''}) - - return current_articles - - - # To GET SECTIONS - def parse_index(self): - feeds = [] 
- for title, url in [ - ('National', - 'http://www.nzherald.co.nz/nz/news/headlines.cfm?c_id=1'), - ('World', - 'http://www.nzherald.co.nz/world/news/headlines.cfm?c_id=2'), - ('Politics', - 'http://www.nzherald.co.nz/politics/news/headlines.cfm?c_id=280'), - ('Crime', - 'http://www.nzherald.co.nz/crime/news/headlines.cfm?c_id=30'), - ('Environment', - 'http://www.nzherald.co.nz/environment/news/headlines.cfm?c_id=39'), - ]: - articles = self.nz_parse_section(url) - if articles: - feeds.append((title, articles)) - return feeds + def print_version(self, url): + m = re.search(r'objectid=(\d+)', url) + if m is None: + return url + return 'http://www.nzherald.co.nz/news/print.cfm?pnum=1&objectid=' + m.group(1) diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 168ed6c5ab..e081dc678e 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -957,6 +957,8 @@ class BasicNewsRecipe(Recipe): self.log.error(_('Could not download cover: %s')%str(err)) self.log.debug(traceback.format_exc()) else: + if not cu: + return cdata = None if os.access(cu, os.R_OK): cdata = open(cu, 'rb').read() @@ -987,6 +989,7 @@ class BasicNewsRecipe(Recipe): self.cover_path = cpath def download_cover(self): + self.cover_path = None try: self._download_cover() except: