Merge from trunk

Commit b7109b300f by Sengian, 2010-10-30 20:23:08 +02:00
15 changed files with 193 additions and 98 deletions


@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 __license__ = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '30 October 2010, Jordi Balcells based on an earlier recipe by Darko Miletic <darko.miletic at gmail.com>'
 '''
 elperiodico.cat
 '''
@@ -12,8 +12,8 @@ from calibre.ebooks.BeautifulSoup import Tag

 class ElPeriodico_cat(BasicNewsRecipe):
     title = 'El Periodico de Catalunya'
-    __author__ = 'Darko Miletic'
-    description = 'Noticias desde Catalunya'
+    __author__ = 'Jordi Balcells/Darko Miletic'
+    description = 'Noticies des de Catalunya'
     publisher = 'elperiodico.cat'
     category = 'news, politics, Spain, Catalunya'
     oldest_article = 2
@@ -33,15 +33,25 @@ class ElPeriodico_cat(BasicNewsRecipe):
     html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

-    feeds = [(u"Tota l'edició", u'http://www.elperiodico.cat/rss.asp?id=46')]
+    feeds = [(u'Portada', u'http://www.elperiodico.cat/ca/rss/rss_portada.xml'),
+             (u'Internacional', u'http://www.elperiodico.cat/ca/rss/internacional/rss.xml'),
+             (u'Societat', u'http://www.elperiodico.cat/ca/rss/societat/rss.xml'),
+             (u'Ci\xe8ncia i tecnologia', u'http://www.elperiodico.cat/ca/rss/ciencia-i-tecnologia/rss.xml'),
+             (u'Esports', u'http://www.elperiodico.cat/ca/rss/esports/rss.xml'),
+             (u'Gent', u'http://www.elperiodico.cat/ca/rss/gent/rss.xml'),
+             (u'Opini\xf3', u'http://www.elperiodico.cat/ca/rss/opinio/rss.xml'),
+             (u'Pol\xedtica', u'http://www.elperiodico.cat/ca/rss/politica/rss.xml'),
+             (u'Barcelona', u'http://www.elperiodico.cat/ca/rss/barcelona/rss.xml'),
+             (u'Economia', u'http://www.elperiodico.cat/ca/rss/economia/rss.xml'),
+             (u'Cultura i espectacles', u'http://www.elperiodico.cat/ca/rss/cultura-i-espectacles/rss.xml'),
+             (u'Tele', u'http://www.elperiodico.cat/ca/rss/tele/rss.xml')]

-    keep_only_tags = [dict(name='div', attrs={'id':'noticia'})]
+    keep_only_tags = [dict(name='div', attrs={'class':'titularnoticia'}),
+                      dict(name='div', attrs={'class':'noticia_completa'})]

-    remove_tags = [
-                    dict(name=['object','link','script'])
-                   ,dict(name='ul',attrs={'class':'herramientasDeNoticia'})
-                   ,dict(name='div', attrs={'id':'inferiores'})
+    remove_tags = [dict(name='div', attrs={'class':['opcionb','opcionb last','columna_noticia']}),
+                   dict(name='span', attrs={'class':'opcionesnoticia'})
                    ]

     def print_version(self, url):


@@ -2,17 +2,17 @@
 # -*- coding: utf-8 -*-
 __license__ = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '30 October 2010, Jordi Balcells based on an earlier recipe by Darko Miletic <darko.miletic at gmail.com>'
 '''
-elperiodico.com
+elperiodico.cat
 '''

 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import Tag

-class ElPeriodico_esp(BasicNewsRecipe):
+class ElPeriodico_cat(BasicNewsRecipe):
     title = 'El Periodico de Catalunya'
-    __author__ = 'Darko Miletic'
+    __author__ = 'Jordi Balcells/Darko Miletic'
     description = 'Noticias desde Catalunya'
     publisher = 'elperiodico.com'
     category = 'news, politics, Spain, Catalunya'
@@ -33,15 +33,25 @@ class ElPeriodico_esp(BasicNewsRecipe):
     html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

-    feeds = [(u"Toda la edición", u'http://www.elperiodico.com/rss.asp?id=46')]
+    feeds = [(u'Portada', u'http://www.elperiodico.com/es/rss/rss_portada.xml'),
+             (u'Internacional', u'http://elperiodico.com/es/rss/internacional/rss.xml'),
+             (u'Sociedad', u'http://elperiodico.com/es/rss/sociedad/rss.xml'),
+             (u'Ciencia y Tecnolog\xeda', u'http://elperiodico.com/es/rss/ciencia-y-tecnologia/rss.xml'),
+             (u'Deportes', u'http://elperiodico.com/es/rss/deportes/rss.xml'),
+             (u'Gente', u'http://elperiodico.com/es/rss/gente/rss.xml'),
+             (u'Opini\xf3n', u'http://elperiodico.com/es/rss/opinion/rss.xml'),
+             (u'Pol\xedtica', u'http://elperiodico.com/es/rss/politica/rss.xml'),
+             (u'Barcelona', u'http://elperiodico.com/es/rss/barcelona/rss.xml'),
+             (u'Econom\xeda', u'http://elperiodico.com/es/rss/economia/rss.xml'),
+             (u'Cultura y espect\xe1culos', u'http://elperiodico.com/es/rss/cultura-y-espectaculos/rss.xml'),
+             (u'Tele', u'http://elperiodico.com/es/rss/cultura-y-espectaculos/rss.xml')]

-    keep_only_tags = [dict(name='div', attrs={'id':'noticia'})]
+    keep_only_tags = [dict(name='div', attrs={'class':'titularnoticia'}),
+                      dict(name='div', attrs={'class':'noticia_completa'})]

-    remove_tags = [
-                    dict(name=['object','link','script'])
-                   ,dict(name='ul',attrs={'class':'herramientasDeNoticia'})
-                   ,dict(name='div', attrs={'id':'inferiores'})
+    remove_tags = [dict(name='div', attrs={'class':['opcionb','opcionb last','columna_noticia']}),
+                   dict(name='span', attrs={'class':'opcionesnoticia'})
                    ]

     def print_version(self, url):


@@ -0,0 +1,41 @@
+__license__ = 'GPL v3'
+__author__ = u'Marc T\xf6nsing'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class GamespotCom(BasicNewsRecipe):
+
+    title = u'Gamespot.com Reviews'
+    description = 'review articles from gamespot.com'
+    language = 'en'
+    __author__ = u'Marc T\xf6nsing'
+    oldest_article = 7
+    max_articles_per_feed = 40
+    remove_empty_feeds = True
+    no_stylesheets = True
+    no_javascript = True
+
+    feeds = [
+        ('PC Reviews', 'http://www.gamespot.com/rss/game_updates.php?type=5&platform=5'),
+        ('XBOX 360 Reviews', 'http://www.gamespot.com/rss/game_updates.php?type=5&platform=1029'),
+        ('Wii Reviews', 'http://www.gamespot.com/rss/game_updates.php?type=5&platform=1031'),
+        ('PlayStation 3 Reviews', 'http://www.gamespot.com/rss/game_updates.php?type=5&platform=1028'),
+        ('PlayStation 2 Reviews', 'http://www.gamespot.com/rss/game_updates.php?type=5&platform=7'),
+        ('PlayStation Portable Reviews', 'http://www.gamespot.com/rss/game_updates.php?type=5&platform=1024'),
+        ('Nintendo DS Reviews', 'http://www.gamespot.com/rss/game_updates.php?type=5&platform=1026'),
+        ('iPhone Reviews', 'http://www.gamespot.com/rss/game_updates.php?type=5&platform=1049'),
+        ]
+
+    remove_tags = [
+        dict(name='div', attrs={'class':'top_bar'}),
+        dict(name='div', attrs={'class':'video_embed'})
+        ]
+
+    def get_cover_url(self):
+        return 'http://image.gamespotcdn.net/gamespot/shared/gs5/gslogo_bw.gif'
+
+    def get_article_url(self, article):
+        return article.get('link') + '?print=1'


@@ -1,74 +1,43 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe
+import re

 class NewZealandHerald(BasicNewsRecipe):

     title = 'New Zealand Herald'
-    __author__ = 'Krittika Goyal'
+    __author__ = 'Kovid Goyal'
     description = 'Daily news'
     timefmt = ' [%d %b, %Y]'
     language = 'en_NZ'

     oldest_article = 2.5
     no_stylesheets = True
-    remove_tags_before = dict(name='div', attrs={'class':'contentContainer left eight'})
-    remove_tags_after = dict(name='div', attrs={'class':'callToAction'})
-    remove_tags = [
-        dict(name='iframe'),
-        dict(name='div', attrs={'class':['sectionHeader', 'tools','callToAction', 'contentContainer right two nopad relatedColumn']}),
-        #dict(name='div', attrs={'id':['shareContainer']}),
-        #dict(name='form', attrs={'onsubmit':"return verifySearch(this.w,'Keyword, citation, or author')"}),
-        #dict(name='table', attrs={'cellspacing':'0'}),
-    ]
-
-    feeds = [
-        ('Business', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000003.xml'),
-        ('World', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000002.xml'),
-        ('National', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000001.xml'),
-        ('Entertainment', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_001501119.xml'),
-        ('Travel', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000007.xml'),
-        ('Opinion', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000466.xml'),
-        ('Life & Style', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000006.xml'),
-        ('Technology', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000005.xml'),
-        ('Sport', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000004.xml'),
-        ('Motoring', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000009.xml'),
-        ('Property', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000008.xml'),
-    ]

     def preprocess_html(self, soup):
         table = soup.find('table')
         if table is not None:
             table.extract()
         return soup

+    #TO GET ARTICLES IN SECTION
+    def nz_parse_section(self, url):
+        soup = self.index_to_soup(url)
+        div = soup.find(attrs={'class':'col-300 categoryList'})
+        date = div.find(attrs={'class':'link-list-heading'})
+
+        current_articles = []
+        for x in date.findAllNext(attrs={'class':['linkList', 'link-list-heading']}):
+            if x.get('class') == 'link-list-heading': break
+            for li in x.findAll('li'):
+                a = li.find('a', href=True)
+                if a is None:
+                    continue
+                title = self.tag_to_string(a)
+                url = a.get('href', False)
+                if not url or not title:
+                    continue
+                if url.startswith('/'):
+                    url = 'http://www.nzherald.co.nz'+url
+                self.log('\t\tFound article:', title)
+                self.log('\t\t\t', url)
+                current_articles.append({'title': title, 'url':url,
+                    'description':'', 'date':''})
+
+        return current_articles
+
+    # To GET SECTIONS
+    def parse_index(self):
+        feeds = []
+        for title, url in [
+            ('National', 'http://www.nzherald.co.nz/nz/news/headlines.cfm?c_id=1'),
+            ('World', 'http://www.nzherald.co.nz/world/news/headlines.cfm?c_id=2'),
+            ('Politics', 'http://www.nzherald.co.nz/politics/news/headlines.cfm?c_id=280'),
+            ('Crime', 'http://www.nzherald.co.nz/crime/news/headlines.cfm?c_id=30'),
+            ('Environment', 'http://www.nzherald.co.nz/environment/news/headlines.cfm?c_id=39'),
+        ]:
+            articles = self.nz_parse_section(url)
+            if articles:
+                feeds.append((title, articles))
+        return feeds
+
+    def print_version(self, url):
+        m = re.search(r'objectid=(\d+)', url)
+        if m is None:
+            return url
+        return 'http://www.nzherald.co.nz/news/print.cfm?pnum=1&objectid=' + m.group(1)
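
For illustration, the URL rewrite that the new print_version performs, applied to a made-up article link (the sample URL and object id are not from the commit):

    import re

    url = 'http://www.nzherald.co.nz/nz/news/article.cfm?c_id=1&objectid=1234567'
    m = re.search(r'objectid=(\d+)', url)
    if m is not None:
        # -> http://www.nzherald.co.nz/news/print.cfm?pnum=1&objectid=1234567
        print('http://www.nzherald.co.nz/news/print.cfm?pnum=1&objectid=' + m.group(1))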


@@ -259,6 +259,9 @@ class OutputProfile(Plugin):
     #: Number of ems that the left margin of a blockquote is rendered as
     mobi_ems_per_blockquote = 1.0

+    #: Special periodical formatting needed in EPUB
+    epub_periodical_format = None
+
     @classmethod
     def tags_to_string(cls, tags):
         return escape(', '.join(tags))
@@ -439,6 +442,9 @@
     fsizes = [7.5, 9, 10, 12, 15.5, 20, 22, 24]
     unsupported_unicode_chars = [u'\u201f', u'\u201b']

+    epub_periodical_format = 'sony'
+    #periodical_date_in_title = False
+
 class KoboReaderOutput(OutputProfile):
@@ -561,6 +567,8 @@ class CybookOpusOutput(SonyReaderOutput):
     fbase = 16
     fsizes = [12, 12, 14, 16, 18, 20, 22, 24]

+    epub_periodical_format = None
+
 class KindleOutput(OutputProfile):
     name = 'Kindle'


@@ -187,9 +187,10 @@
         metadata_xml = None
         extra_entries = []
         if self.is_periodical:
-            from calibre.ebooks.epub.periodical import sony_metadata
-            metadata_xml, atom_xml = sony_metadata(oeb)
-            extra_entries = [('atom.xml', 'application/atom+xml', atom_xml)]
+            if self.opts.output_profile.epub_periodical_format == 'sony':
+                from calibre.ebooks.epub.periodical import sony_metadata
+                metadata_xml, atom_xml = sony_metadata(oeb)
+                extra_entries = [('atom.xml', 'application/atom+xml', atom_xml)]
         oeb_output = plugin_for_output_format('oeb')
         oeb_output.convert(oeb, tdir, input_plugin, opts, log)
         opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
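
With this change the Sony periodical metadata (atom.xml plus the extra OPF entries) is generated only when the selected output profile asks for it through the new epub_periodical_format attribute. A minimal sketch of how a device profile would opt in; the class below is hypothetical, while OutputProfile and epub_periodical_format come from calibre.customize.profiles and this commit:

    from calibre.customize.profiles import OutputProfile

    class HypotheticalReaderOutput(OutputProfile):
        name = 'Hypothetical Reader'
        # 'sony' re-enables the periodical path in EPUBOutput; the new
        # default of None skips it for devices that do not need it.
        epub_periodical_format = 'sony'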


@@ -282,15 +282,22 @@
         basedir = os.getcwd()
         self.opts = opts

+        fname = None
         if hasattr(stream, 'name'):
             basedir = os.path.dirname(stream.name)
+            fname = os.path.basename(stream.name)
+
         if file_ext != 'opf':
             if opts.dont_package:
                 raise ValueError('The --dont-package option is not supported for an HTML input file')
             from calibre.ebooks.metadata.html import get_metadata
-            oeb = self.create_oebbook(stream.name, basedir, opts, log,
-                get_metadata(stream))
+            mi = get_metadata(stream)
+            if fname:
+                from calibre.ebooks.metadata.meta import metadata_from_filename
+                fmi = metadata_from_filename(fname)
+                fmi.smart_update(mi)
+                mi = fmi
+            oeb = self.create_oebbook(stream.name, basedir, opts, log, mi)
             return oeb

         from calibre.ebooks.conversion.plumber import create_oebbook
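
The merge order matters: fields parsed from the file name only survive where the HTML metadata is missing, because smart_update lets values set in its argument take precedence. A small sketch of that behaviour, assuming calibre's MetaInformation semantics (the titles below are invented):

    from calibre.ebooks.metadata import MetaInformation

    fmi = MetaInformation('Title From Filename', ['Filename Author'])
    mi = MetaInformation('Title From HTML', None)  # HTML supplied no authors
    fmi.smart_update(mi)   # fields actually set in mi overwrite fmi's
    # fmi now carries the HTML title but keeps the filename-derived author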


@@ -92,10 +92,14 @@ def get_metadata(br, asin, mi):
              ' @class="emptyClear" or @href]'):
         c.getparent().remove(c)
     desc = html.tostring(desc, method='html', encoding=unicode).strip()
-    desc = re.sub(r' class=[^>]+>', '>', desc)
+    # remove all attributes from tags
+    desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
+    # Collapse whitespace
     desc = re.sub('\n+', '\n', desc)
     desc = re.sub(' +', ' ', desc)
     # Remove the notice about text referring to out of print editions
     desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
+    # Remove comments
+    desc = re.sub(r'(?s)<!--.*?-->', '', desc)
     mi.comments = desc
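
Taken together, the substitutions strip tag attributes, collapse whitespace, and drop Amazon boilerplate. Applied to a made-up description fragment (plain stdlib re, runnable standalone):

    import re

    desc = '<div class="productDescription">A  fine   book.\n\n\n' \
           '<em>--This text refers to an out of print edition.</em><!-- ad --></div>'
    desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)    # <div class=...> -> <div>
    desc = re.sub('\n+', '\n', desc)                            # collapse newlines
    desc = re.sub(' +', ' ', desc)                              # collapse spaces
    desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc) # out-of-print notice
    desc = re.sub(r'(?s)<!--.*?-->', '', desc)                  # HTML comments
    print(desc)  # -> '<div>A fine book.\n</div>'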


@@ -12,7 +12,7 @@ import os, time, sys, shutil

 from calibre.utils.ipc.job import ParallelJob
 from calibre.utils.ipc.server import Server
-from calibre.ptempfile import PersistentTemporaryDirectory
+from calibre.ptempfile import PersistentTemporaryDirectory, TemporaryDirectory
 from calibre import prints
 from calibre.constants import filesystem_encoding
@@ -39,6 +39,10 @@ def serialize_metadata_for(formats, tdir, id_):
         f.write(cdata)

 def read_metadata_(task, tdir, notification=lambda x,y:x):
+    with TemporaryDirectory() as mdir:
+        do_read_metadata(task, tdir, mdir, notification)
+
+def do_read_metadata(task, tdir, mdir, notification):
     from calibre.customize.ui import run_plugins_on_import
     for x in task:
         try:
@@ -48,17 +52,28 @@ def read_metadata_(task, tdir, notification=lambda x,y:x):
         try:
             if isinstance(formats, basestring): formats = [formats]
             import_map = {}
-            fmts = []
+            fmts, metadata_fmts = [], []
             for format in formats:
+                mfmt = format
+                name, ext = os.path.splitext(os.path.basename(format))
                 nfp = run_plugins_on_import(format)
-                if not nfp or not os.access(nfp, os.R_OK):
-                    nfp = format
-                nfp = os.path.abspath(nfp)
+                if not nfp or nfp == format or not os.access(nfp, os.R_OK):
+                    nfp = None
+                else:
+                    # Ensure that the filename is preserved so that
+                    # reading metadata from filename is not broken
+                    nfp = os.path.abspath(nfp)
+                    nfext = os.path.splitext(nfp)[1]
+                    mfmt = os.path.join(mdir, name + nfext)
+                    shutil.copyfile(nfp, mfmt)
+                metadata_fmts.append(mfmt)
                 fmts.append(nfp)
-            serialize_metadata_for(fmts, tdir, id_)
+            serialize_metadata_for(metadata_fmts, tdir, id_)
             for format, nfp in zip(formats, fmts):
+                if not nfp:
+                    continue
                 if isinstance(nfp, unicode):
                     nfp.encode(filesystem_encoding)
                 x = lambda j : os.path.abspath(os.path.normpath(os.path.normcase(j)))
@@ -68,7 +83,6 @@
                 dest = os.path.join(tdir, '%s.%s'%(id_, nfmt))
                 shutil.copyfile(nfp, dest)
                 import_map[fmt] = dest
-                os.remove(nfp)
             if import_map:
                 with open(os.path.join(tdir, str(id_)+'.import'), 'wb') as f:
                     for fmt, nfp in import_map.items():
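
The copy into mdir exists so that metadata can still be guessed from the book's original file name after an import plugin has rewritten the file under a different name. A standalone sketch of that name-preserving copy (pure stdlib; the function and paths are illustrative, not calibre code):

    import os, shutil

    def copy_preserving_name(original, plugin_output, workdir):
        # keep the original basename, adopt the (possibly new) extension
        name = os.path.splitext(os.path.basename(original))[0]
        ext = os.path.splitext(plugin_output)[1]
        dest = os.path.join(workdir, name + ext)
        shutil.copyfile(plugin_output, dest)
        return dest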


@@ -55,12 +55,16 @@ class CheckLibraryDialog(QDialog):
         h.addWidget(ln)
         self.name_ignores = QLineEdit()
         self.name_ignores.setText(db.prefs.get('check_library_ignore_names', ''))
+        self.name_ignores.setToolTip(
+            _('Enter comma-separated standard file name wildcards, such as synctoy*.dat'))
         ln.setBuddy(self.name_ignores)
         h.addWidget(self.name_ignores)

         le = QLabel(_('Extensions to ignore'))
         h.addWidget(le)
         self.ext_ignores = QLineEdit()
         self.ext_ignores.setText(db.prefs.get('check_library_ignore_extensions', ''))
+        self.ext_ignores.setToolTip(
+            _('Enter comma-separated extensions without a leading dot. Used only in book folders'))
         le.setBuddy(self.ext_ignores)
         h.addWidget(self.ext_ignores)
         self._layout.addLayout(h)


@@ -571,6 +571,10 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
         self.initalize_authors()
         self.initialize_series()
         self.initialize_publisher()
+        for x in ('authors', 'publisher', 'series'):
+            x = getattr(self, x)
+            x.setSizeAdjustPolicy(x.AdjustToMinimumContentsLengthWithIcon)
+            x.setMinimumContentsLength(25)

     def initalize_authors(self):
         all_authors = self.db.all_authors()


@@ -678,6 +678,19 @@ nothing should be put between the original text and the inserted text</string>
        <item row="8" column="2">
         <widget class="QLineEdit" name="test_result"/>
        </item>
+       <item row="25" column="0" colspan="2">
+        <spacer name="verticalSpacer_2">
+         <property name="orientation">
+          <enum>Qt::Vertical</enum>
+         </property>
+         <property name="sizeHint" stdset="0">
+          <size>
+           <width>20</width>
+           <height>5</height>
+          </size>
+         </property>
+        </spacer>
+       </item>
       </layout>
      </widget>
     </widget>


@@ -5,7 +5,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import re, os, traceback
+import re, os, traceback, fnmatch

 from calibre import isbytestring
 from calibre.constants import filesystem_encoding
@@ -66,13 +66,19 @@
         return self.failed_folders or self.mismatched_dirs or \
                 self.conflicting_custom_cols or self.failed_restores

+    def ignore_name(self, filename):
+        for filespec in self.ignore_names:
+            if fnmatch.fnmatch(filename, filespec):
+                return True
+        return False;
+
     def scan_library(self, name_ignores, extension_ignores):
         self.ignore_names = frozenset(name_ignores)
         self.ignore_ext = frozenset(['.'+ e for e in extension_ignores])

         lib = self.src_library_path
         for auth_dir in os.listdir(lib):
-            if auth_dir in self.ignore_names or auth_dir == 'metadata.db':
+            if self.ignore_name(auth_dir) or auth_dir == 'metadata.db':
                 continue
             auth_path = os.path.join(lib, auth_dir)
             # First check: author must be a directory
@@ -85,7 +91,7 @@
             # Look for titles in the author directories
             found_titles = False
             for title_dir in os.listdir(auth_path):
-                if title_dir in self.ignore_names:
+                if self.ignore_name(title_dir):
                     continue
                 title_path = os.path.join(auth_path, title_dir)
                 db_path = os.path.join(auth_dir, title_dir)
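
The new ignore_name helper turns the comma-separated patterns from the Check Library dialog into shell-style wildcard checks. How stdlib fnmatch behaves for the sample pattern mentioned in the dialog's tooltip (runnable as-is):

    import fnmatch

    for name in ('synctoy_abc.dat', 'synctoy.dat', 'metadata.opf'):
        print(name, fnmatch.fnmatch(name, 'synctoy*.dat'))
    # -> True, True, False; fnmatch also normalises both arguments with
    #    os.path.normcase, so matching is case-insensitive on Windows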


@@ -36,6 +36,7 @@ FileTypePlugin

+.. _pluginsMetadataPlugin:

 Metadata plugins
 -------------------
@@ -50,7 +51,6 @@ Metadata plugins
    :members:
    :member-order: bysource

-.. _pluginsMetadataSource:

 Catalog plugins
 ----------------
@@ -60,6 +60,7 @@ Catalog plugins
    :members:
    :member-order: bysource

+.. _pluginsMetadataSource:

 Metadata download plugins
 --------------------------


@@ -957,6 +957,8 @@
             self.log.error(_('Could not download cover: %s')%str(err))
             self.log.debug(traceback.format_exc())
         else:
+            if not cu:
+                return
             cdata = None
             if os.access(cu, os.R_OK):
                 cdata = open(cu, 'rb').read()
@@ -987,6 +989,7 @@
         self.cover_path = cpath

     def download_cover(self):
+        self.cover_path = None
         try:
             self._download_cover()
         except: