Sync to trunk.

This commit is contained in:
John Schember 2011-07-03 09:45:48 -04:00
commit 0774cec024
4 changed files with 342 additions and 37 deletions

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008 - 2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = 'Copyright 2011 Starson17'
'''
engadget.com
'''
@ -9,14 +9,29 @@ engadget.com
from calibre.web.feeds.news import BasicNewsRecipe
class Engadget(BasicNewsRecipe):
title = u'Engadget'
__author__ = 'Darko Miletic'
title = u'Engadget_Full'
__author__ = 'Starson17'
__version__ = 'v1.00'
__date__ = '02, July 2011'
description = 'Tech news'
language = 'en'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = True
use_embedded_content = False
remove_javascript = True
remove_empty_feeds = True
feeds = [ (u'Posts', u'http://www.engadget.com/rss.xml')]
keep_only_tags = [dict(name='div', attrs={'class':['post_content permalink ','post_content permalink alt-post-full']})]
remove_tags = [dict(name='div', attrs={'class':['filed_under','post_footer']})]
remove_tags_after = [dict(name='div', attrs={'class':['post_footer']})]
feeds = [(u'Posts', u'http://www.engadget.com/rss.xml')]
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''

80
recipes/scmp.recipe Normal file
View File

@ -0,0 +1,80 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
scmp.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class SCMP(BasicNewsRecipe):
title = 'South China Morning Post'
__author__ = 'llam'
description = "SCMP.com, Hong Kong's premier online English daily provides exclusive up-to-date news, audio video news, podcasts, RSS Feeds, Blogs, breaking news, top stories, award winning news and analysis on Hong Kong and China."
publisher = 'South China Morning Post Publishers Ltd.'
category = 'SCMP, Online news, Hong Kong News, China news, Business news, English newspaper, daily newspaper, Lifestyle news, Sport news, Audio Video news, Asia news, World news, economy news, investor relations news, RSS Feeds'
oldest_article = 2
delay = 1
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
language = 'en_CN'
remove_empty_feeds = True
needs_subscription = True
publication_type = 'newspaper'
masthead_url = 'http://www.scmp.com/images/logo_scmp_home.gif'
extra_css = ' body{font-family: Arial,Helvetica,sans-serif } '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
def get_browser(self):
br = BasicNewsRecipe.get_browser()
#br.set_debug_http(True)
#br.set_debug_responses(True)
#br.set_debug_redirects(True)
if self.username is not None and self.password is not None:
br.open('http://www.scmp.com/portal/site/SCMP/')
br.select_form(name='loginForm')
br['Login' ] = self.username
br['Password'] = self.password
br.submit()
return br
remove_attributes=['width','height','border']
keep_only_tags = [
dict(attrs={'id':['ART','photoBox']})
,dict(attrs={'class':['article_label','article_byline','article_body']})
]
preprocess_regexps = [
(re.compile(r'<P><table((?!<table).)*class="embscreen"((?!</table>).)*</table>', re.DOTALL|re.IGNORECASE),
lambda match: ''),
]
feeds = [
(u'Business' , u'http://www.scmp.com/rss/business.xml' )
,(u'Hong Kong' , u'http://www.scmp.com/rss/hong_kong.xml' )
,(u'China' , u'http://www.scmp.com/rss/china.xml' )
,(u'Asia & World' , u'http://www.scmp.com/rss/news_asia_world.xml')
,(u'Opinion' , u'http://www.scmp.com/rss/opinion.xml' )
,(u'LifeSTYLE' , u'http://www.scmp.com/rss/lifestyle.xml' )
,(u'Sport' , u'http://www.scmp.com/rss/sport.xml' )
]
def print_version(self, url):
rpart, sep, rest = url.rpartition('&')
return rpart #+ sep + urllib.quote_plus(rest)
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
items = soup.findAll(src="/images/label_icon.gif")
[item.extract() for item in items]
return self.adeify_images(soup)

View File

@ -22,7 +22,7 @@ from calibre.library.field_metadata import FieldMetadata
from calibre.ebooks.metadata import title_sort, author_to_author_sort
from calibre.utils.icu import strcmp
from calibre.utils.config import to_json, from_json, prefs, tweaks
from calibre.utils.date import utcfromtimestamp
from calibre.utils.date import utcfromtimestamp, parse_date
from calibre.db.tables import (OneToOneTable, ManyToOneTable, ManyToManyTable,
SizeTable, FormatsTable, AuthorsTable, IdentifiersTable)
# }}}
@ -248,7 +248,10 @@ class DB(object, SchemaUpgrade):
UPDATE authors SET sort=author_to_author_sort(name) WHERE sort IS NULL;
''')
def initialize_prefs(self, default_prefs):
self.initialize_custom_columns()
self.initialize_tables()
def initialize_prefs(self, default_prefs): # {{{
self.prefs = DBPrefs(self)
if default_prefs is not None and not self._exists:
@ -339,6 +342,222 @@ class DB(object, SchemaUpgrade):
cats_changed = True
if cats_changed:
self.prefs.set('user_categories', user_cats)
# }}}
def initialize_custom_columns(self): # {{{
with self.conn:
# Delete previously marked custom columns
for record in self.conn.get(
'SELECT id FROM custom_columns WHERE mark_for_delete=1'):
num = record[0]
table, lt = self.custom_table_names(num)
self.conn.execute('''\
DROP INDEX IF EXISTS {table}_idx;
DROP INDEX IF EXISTS {lt}_aidx;
DROP INDEX IF EXISTS {lt}_bidx;
DROP TRIGGER IF EXISTS fkc_update_{lt}_a;
DROP TRIGGER IF EXISTS fkc_update_{lt}_b;
DROP TRIGGER IF EXISTS fkc_insert_{lt};
DROP TRIGGER IF EXISTS fkc_delete_{lt};
DROP TRIGGER IF EXISTS fkc_insert_{table};
DROP TRIGGER IF EXISTS fkc_delete_{table};
DROP VIEW IF EXISTS tag_browser_{table};
DROP VIEW IF EXISTS tag_browser_filtered_{table};
DROP TABLE IF EXISTS {table};
DROP TABLE IF EXISTS {lt};
'''.format(table=table, lt=lt)
)
self.conn.execute('DELETE FROM custom_columns WHERE mark_for_delete=1')
# Load metadata for custom columns
self.custom_column_label_map, self.custom_column_num_map = {}, {}
triggers = []
remove = []
custom_tables = self.custom_tables
for record in self.conn.get(
'SELECT label,name,datatype,editable,display,normalized,id,is_multiple FROM custom_columns'):
data = {
'label':record[0],
'name':record[1],
'datatype':record[2],
'editable':bool(record[3]),
'display':json.loads(record[4]),
'normalized':bool(record[5]),
'num':record[6],
'is_multiple':bool(record[7]),
}
if data['display'] is None:
data['display'] = {}
# set up the is_multiple separator dict
if data['is_multiple']:
if data['display'].get('is_names', False):
seps = {'cache_to_list': '|', 'ui_to_list': '&', 'list_to_ui': ' & '}
elif data['datatype'] == 'composite':
seps = {'cache_to_list': ',', 'ui_to_list': ',', 'list_to_ui': ', '}
else:
seps = {'cache_to_list': '|', 'ui_to_list': ',', 'list_to_ui': ', '}
else:
seps = {}
data['multiple_seps'] = seps
table, lt = self.custom_table_names(data['num'])
if table not in custom_tables or (data['normalized'] and lt not in
custom_tables):
remove.append(data)
continue
self.custom_column_label_map[data['label']] = data['num']
self.custom_column_num_map[data['num']] = \
self.custom_column_label_map[data['label']] = data
# Create Foreign Key triggers
if data['normalized']:
trigger = 'DELETE FROM %s WHERE book=OLD.id;'%lt
else:
trigger = 'DELETE FROM %s WHERE book=OLD.id;'%table
triggers.append(trigger)
if remove:
with self.conn:
for data in remove:
prints('WARNING: Custom column %r not found, removing.' %
data['label'])
self.conn.execute('DELETE FROM custom_columns WHERE id=?',
(data['num'],))
if triggers:
with self.conn:
self.conn.execute('''\
CREATE TEMP TRIGGER custom_books_delete_trg
AFTER DELETE ON books
BEGIN
%s
END;
'''%(' \n'.join(triggers)))
# Setup data adapters
def adapt_text(x, d):
if d['is_multiple']:
if x is None:
return []
if isinstance(x, (str, unicode, bytes)):
x = x.split(d['multiple_seps']['ui_to_list'])
x = [y.strip() for y in x if y.strip()]
x = [y.decode(preferred_encoding, 'replace') if not isinstance(y,
unicode) else y for y in x]
return [u' '.join(y.split()) for y in x]
else:
return x if x is None or isinstance(x, unicode) else \
x.decode(preferred_encoding, 'replace')
def adapt_datetime(x, d):
if isinstance(x, (str, unicode, bytes)):
x = parse_date(x, assume_utc=False, as_utc=False)
return x
def adapt_bool(x, d):
if isinstance(x, (str, unicode, bytes)):
x = x.lower()
if x == 'true':
x = True
elif x == 'false':
x = False
elif x == 'none':
x = None
else:
x = bool(int(x))
return x
def adapt_enum(x, d):
v = adapt_text(x, d)
if not v:
v = None
return v
def adapt_number(x, d):
if x is None:
return None
if isinstance(x, (str, unicode, bytes)):
if x.lower() == 'none':
return None
if d['datatype'] == 'int':
return int(x)
return float(x)
self.custom_data_adapters = {
'float': adapt_number,
'int': adapt_number,
'rating':lambda x,d : x if x is None else min(10., max(0., float(x))),
'bool': adapt_bool,
'comments': lambda x,d: adapt_text(x, {'is_multiple':False}),
'datetime' : adapt_datetime,
'text':adapt_text,
'series':adapt_text,
'enumeration': adapt_enum
}
# Create Tag Browser categories for custom columns
for k in sorted(self.custom_column_label_map.iterkeys()):
v = self.custom_column_label_map[k]
if v['normalized']:
is_category = True
else:
is_category = False
is_m = v['multiple_seps']
tn = 'custom_column_{0}'.format(v['num'])
self.field_metadata.add_custom_field(label=v['label'],
table=tn, column='value', datatype=v['datatype'],
colnum=v['num'], name=v['name'], display=v['display'],
is_multiple=is_m, is_category=is_category,
is_editable=v['editable'], is_csp=False)
# }}}
def initialize_tables(self): # {{{
tables = self.tables = {}
for col in ('title', 'sort', 'author_sort', 'series_index', 'comments',
'timestamp', 'published', 'uuid', 'path', 'cover',
'last_modified'):
metadata = self.field_metadata[col].copy()
if metadata['table'] is None:
metadata['table'], metadata['column'] == 'books', ('has_cover'
if col == 'cover' else col)
tables[col] = OneToOneTable(col, metadata)
for col in ('series', 'publisher', 'rating'):
tables[col] = ManyToOneTable(col, self.field_metadata[col].copy())
for col in ('authors', 'tags', 'formats', 'identifiers'):
cls = {
'authors':AuthorsTable,
'formats':FormatsTable,
'identifiers':IdentifiersTable,
}.get(col, ManyToManyTable)
tables[col] = cls(col, self.field_metadata[col].copy())
tables['size'] = SizeTable('size', self.field_metadata['size'].copy())
for label, data in self.custom_column_label_map.iteritems():
metadata = self.field_metadata[label].copy()
link_table = self.custom_table_names(data['num'])[1]
if data['normalized']:
if metadata['is_multiple']:
tables[label] = ManyToManyTable(label, metadata,
link_table=link_table)
else:
tables[label] = ManyToOneTable(label, metadata,
link_table=link_table)
if metadata['datatype'] == 'series':
# Create series index table
label += '_index'
metadata = self.field_metadata[label].copy()
metadata['column'] = 'extra'
metadata['table'] = link_table
tables[label] = OneToOneTable(label, metadata)
else:
tables[label] = OneToOneTable(label, metadata)
# }}}
@property
def conn(self):
@ -372,6 +591,15 @@ class DB(object, SchemaUpgrade):
# Database layer API {{{
def custom_table_names(self, num):
return 'custom_column_%d'%num, 'books_custom_column_%d_link'%num
@property
def custom_tables(self):
return set([x[0] for x in self.conn.get(
'SELECT name FROM sqlite_master WHERE type="table" AND '
'(name GLOB "custom_column_*" OR name GLOB "books_custom_column_*")')])
@classmethod
def exists_at(cls, path):
return path and os.path.exists(os.path.join(path, 'metadata.db'))
@ -405,39 +633,18 @@ class DB(object, SchemaUpgrade):
return utcfromtimestamp(os.stat(self.dbpath).st_mtime)
def read_tables(self):
tables = {}
for col in ('title', 'sort', 'author_sort', 'series_index', 'comments',
'timestamp', 'published', 'uuid', 'path', 'cover',
'last_modified'):
metadata = self.field_metadata[col].copy()
if metadata['table'] is None:
metadata['table'], metadata['column'] == 'books', ('has_cover'
if col == 'cover' else col)
tables[col] = OneToOneTable(col, metadata)
for col in ('series', 'publisher', 'rating'):
tables[col] = ManyToOneTable(col, self.field_metadata[col].copy())
for col in ('authors', 'tags', 'formats', 'identifiers'):
cls = {
'authors':AuthorsTable,
'formats':FormatsTable,
'identifiers':IdentifiersTable,
}.get(col, ManyToManyTable)
tables[col] = cls(col, self.field_metadata[col].copy())
tables['size'] = SizeTable('size', self.field_metadata['size'].copy())
'''
Read all data from the db into the python in-memory tables
'''
with self.conn: # Use a single transaction, to ensure nothing modifies
# the db while we are reading
for table in tables.itervalues():
for table in self.tables.itervalues():
try:
table.read()
except:
prints('Failed to read table:', table.name)
raise
return tables
# }}}

View File

@ -32,7 +32,7 @@ def _c_convert_timestamp(val):
class Table(object):
def __init__(self, name, metadata):
def __init__(self, name, metadata, link_table=None):
self.name, self.metadata = name, metadata
# self.adapt() maps values from the db to python objects
@ -46,6 +46,9 @@ class Table(object):
# Legacy
self.adapt = lambda x: x.replace('|', ',') if x else None
self.link_table = (link_table if link_table else
'books_%s_link'%self.metadata['table'])
class OneToOneTable(Table):
'''
@ -95,8 +98,8 @@ class ManyToOneTable(Table):
def read_maps(self, db):
for row in db.conn.execute(
'SELECT book, {0} FROM books_{1}_link'.format(
self.metadata['link_column'], self.metadata['table'])):
'SELECT book, {0} FROM {1}'.format(
self.metadata['link_column'], self.link_table)):
if row[1] not in self.col_book_map:
self.col_book_map[row[1]] = []
self.col_book_map.append(row[0])
@ -112,8 +115,8 @@ class ManyToManyTable(ManyToOneTable):
def read_maps(self, db):
for row in db.conn.execute(
'SELECT book, {0} FROM books_{1}_link'.format(
self.metadata['link_column'], self.metadata['table'])):
'SELECT book, {0} FROM {1}'.format(
self.metadata['link_column'], self.link_table)):
if row[1] not in self.col_book_map:
self.col_book_map[row[1]] = []
self.col_book_map.append(row[0])