Sync to trunk.

2025-07-09 03:04:10 -04:00 · 2011-07-03 09:45:48 -04:00 · 2011-07-03 09:45:48 -04:00 · 0774cec024
commit 0774cec024
parent 75b0237931 ba16dcb128
4 changed files with 342 additions and 37 deletions
--- a/recipes/endgadget.recipe
+++ b/recipes/endgadget.recipe
@ -1,7 +1,7 @@
 #!/usr/bin/env  python

 __license__   = 'GPL v3'
-__copyright__ = '2008 - 2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = 'Copyright 2011 Starson17'
 '''
 engadget.com
 '''
@ -9,14 +9,29 @@ engadget.com
 from calibre.web.feeds.news import BasicNewsRecipe

 class Engadget(BasicNewsRecipe):
-    title                 = u'Engadget'
-    __author__            = 'Darko Miletic'
+    title                 = u'Engadget_Full'
+    __author__            = 'Starson17'
+    __version__           = 'v1.00'
+    __date__              = '02, July 2011'
    description           = 'Tech news'
    language              = 'en'
    oldest_article        = 7
    max_articles_per_feed = 100
    no_stylesheets        = True
-    use_embedded_content  = True
+    use_embedded_content  = False
+    remove_javascript     = True
+    remove_empty_feeds    = True

-    feeds = [ (u'Posts', u'http://www.engadget.com/rss.xml')]
+    keep_only_tags = [dict(name='div', attrs={'class':['post_content permalink ','post_content permalink alt-post-full']})]
+    remove_tags = [dict(name='div', attrs={'class':['filed_under','post_footer']})]
+    remove_tags_after =  [dict(name='div', attrs={'class':['post_footer']})]
+
+    feeds = [(u'Posts', u'http://www.engadget.com/rss.xml')]
+
+    extra_css = '''
+                    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
+                    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
+                    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+                    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
+		'''

--- a/recipes/scmp.recipe
+++ b/recipes/scmp.recipe
@ -0,0 +1,80 @@
+__license__   = 'GPL v3'
+__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+'''
+scmp.com
+'''
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class SCMP(BasicNewsRecipe):
+    title                 = 'South China Morning Post'
+    __author__            = 'llam'
+    description           = "SCMP.com, Hong Kong's premier online English daily provides exclusive up-to-date news, audio video news, podcasts, RSS Feeds, Blogs, breaking news, top stories, award winning news and analysis on Hong Kong and China."
+    publisher             = 'South China Morning Post Publishers Ltd.'
+    category              = 'SCMP, Online news, Hong Kong News, China news, Business news, English newspaper, daily newspaper, Lifestyle news, Sport news, Audio Video news, Asia news, World news, economy news, investor relations news, RSS Feeds'
+    oldest_article        = 2
+    delay                 = 1
+    max_articles_per_feed = 200
+    no_stylesheets        = True
+    encoding              = 'utf-8'
+    use_embedded_content  = False
+    language              = 'en_CN'
+    remove_empty_feeds    = True
+    needs_subscription    = True
+    publication_type      = 'newspaper'
+    masthead_url          = 'http://www.scmp.com/images/logo_scmp_home.gif'
+    extra_css             = ' body{font-family: Arial,Helvetica,sans-serif } '
+
+    conversion_options = {
+                          'comment'   : description
+                        , 'tags'      : category
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }
+
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+        #br.set_debug_http(True)
+        #br.set_debug_responses(True)
+        #br.set_debug_redirects(True)
+        if self.username is not None and self.password is not None:
+           br.open('http://www.scmp.com/portal/site/SCMP/')
+           br.select_form(name='loginForm')
+           br['Login'   ] = self.username
+           br['Password'] = self.password
+           br.submit()
+        return br
+
+    remove_attributes=['width','height','border']
+
+    keep_only_tags = [
+                        dict(attrs={'id':['ART','photoBox']})
+                       ,dict(attrs={'class':['article_label','article_byline','article_body']})
+                     ]
+
+    preprocess_regexps = [
+       (re.compile(r'<P><table((?!<table).)*class="embscreen"((?!</table>).)*</table>', re.DOTALL|re.IGNORECASE),
+        lambda match: ''),
+    ]
+
+    feeds = [
+              (u'Business'     , u'http://www.scmp.com/rss/business.xml'       )
+             ,(u'Hong Kong'    , u'http://www.scmp.com/rss/hong_kong.xml'      )
+             ,(u'China'        , u'http://www.scmp.com/rss/china.xml'          )
+             ,(u'Asia & World' , u'http://www.scmp.com/rss/news_asia_world.xml')
+             ,(u'Opinion'      , u'http://www.scmp.com/rss/opinion.xml'        )
+             ,(u'LifeSTYLE'    , u'http://www.scmp.com/rss/lifestyle.xml'      )
+             ,(u'Sport'        , u'http://www.scmp.com/rss/sport.xml'          )
+            ]
+
+    def print_version(self, url):
+        rpart, sep, rest = url.rpartition('&')
+        return rpart #+ sep + urllib.quote_plus(rest)
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        items = soup.findAll(src="/images/label_icon.gif")
+        [item.extract() for item in items]
+        return self.adeify_images(soup)
--- a/src/calibre/db/backend.py
+++ b/src/calibre/db/backend.py
@ -22,7 +22,7 @@ from calibre.library.field_metadata import FieldMetadata
 from calibre.ebooks.metadata import title_sort, author_to_author_sort
 from calibre.utils.icu import strcmp
 from calibre.utils.config import to_json, from_json, prefs, tweaks
-from calibre.utils.date import utcfromtimestamp
+from calibre.utils.date import utcfromtimestamp, parse_date
 from calibre.db.tables import (OneToOneTable, ManyToOneTable, ManyToManyTable,
        SizeTable, FormatsTable, AuthorsTable, IdentifiersTable)
 # }}}
@ -248,7 +248,10 @@ class DB(object, SchemaUpgrade):
        UPDATE authors SET sort=author_to_author_sort(name) WHERE sort IS NULL;
        ''')

-    def initialize_prefs(self, default_prefs):
+        self.initialize_custom_columns()
+        self.initialize_tables()
+
+    def initialize_prefs(self, default_prefs): # {{{
        self.prefs = DBPrefs(self)

        if default_prefs is not None and not self._exists:
@ -339,6 +342,222 @@ class DB(object, SchemaUpgrade):
                cats_changed = True
        if cats_changed:
            self.prefs.set('user_categories', user_cats)
+    # }}}
+
+    def initialize_custom_columns(self): # {{{
+        with self.conn:
+            # Delete previously marked custom columns
+            for record in self.conn.get(
+                    'SELECT id FROM custom_columns WHERE mark_for_delete=1'):
+                num = record[0]
+                table, lt = self.custom_table_names(num)
+                self.conn.execute('''\
+                        DROP INDEX   IF EXISTS {table}_idx;
+                        DROP INDEX   IF EXISTS {lt}_aidx;
+                        DROP INDEX   IF EXISTS {lt}_bidx;
+                        DROP TRIGGER IF EXISTS fkc_update_{lt}_a;
+                        DROP TRIGGER IF EXISTS fkc_update_{lt}_b;
+                        DROP TRIGGER IF EXISTS fkc_insert_{lt};
+                        DROP TRIGGER IF EXISTS fkc_delete_{lt};
+                        DROP TRIGGER IF EXISTS fkc_insert_{table};
+                        DROP TRIGGER IF EXISTS fkc_delete_{table};
+                        DROP VIEW    IF EXISTS tag_browser_{table};
+                        DROP VIEW    IF EXISTS tag_browser_filtered_{table};
+                        DROP TABLE   IF EXISTS {table};
+                        DROP TABLE   IF EXISTS {lt};
+                        '''.format(table=table, lt=lt)
+                )
+            self.conn.execute('DELETE FROM custom_columns WHERE mark_for_delete=1')
+
+        # Load metadata for custom columns
+        self.custom_column_label_map, self.custom_column_num_map = {}, {}
+        triggers = []
+        remove = []
+        custom_tables = self.custom_tables
+        for record in self.conn.get(
+                'SELECT label,name,datatype,editable,display,normalized,id,is_multiple FROM custom_columns'):
+            data = {
+                    'label':record[0],
+                    'name':record[1],
+                    'datatype':record[2],
+                    'editable':bool(record[3]),
+                    'display':json.loads(record[4]),
+                    'normalized':bool(record[5]),
+                    'num':record[6],
+                    'is_multiple':bool(record[7]),
+                    }
+            if data['display'] is None:
+                data['display'] = {}
+            # set up the is_multiple separator dict
+            if data['is_multiple']:
+                if data['display'].get('is_names', False):
+                    seps = {'cache_to_list': '|', 'ui_to_list': '&', 'list_to_ui': ' & '}
+                elif data['datatype'] == 'composite':
+                    seps = {'cache_to_list': ',', 'ui_to_list': ',', 'list_to_ui': ', '}
+                else:
+                    seps = {'cache_to_list': '|', 'ui_to_list': ',', 'list_to_ui': ', '}
+            else:
+                seps = {}
+            data['multiple_seps'] = seps
+
+            table, lt = self.custom_table_names(data['num'])
+            if table not in custom_tables or (data['normalized'] and lt not in
+                    custom_tables):
+                remove.append(data)
+                continue
+
+            self.custom_column_label_map[data['label']] = data['num']
+            self.custom_column_num_map[data['num']] = \
+                self.custom_column_label_map[data['label']] = data
+
+            # Create Foreign Key triggers
+            if data['normalized']:
+                trigger = 'DELETE FROM %s WHERE book=OLD.id;'%lt
+            else:
+                trigger = 'DELETE FROM %s WHERE book=OLD.id;'%table
+            triggers.append(trigger)
+
+        if remove:
+            with self.conn:
+                for data in remove:
+                    prints('WARNING: Custom column %r not found, removing.' %
+                            data['label'])
+                    self.conn.execute('DELETE FROM custom_columns WHERE id=?',
+                            (data['num'],))
+
+        if triggers:
+            with self.conn:
+                self.conn.execute('''\
+                    CREATE TEMP TRIGGER custom_books_delete_trg
+                        AFTER DELETE ON books
+                        BEGIN
+                        %s
+                    END;
+                    '''%(' \n'.join(triggers)))
+
+        # Setup data adapters
+        def adapt_text(x, d):
+            if d['is_multiple']:
+                if x is None:
+                    return []
+                if isinstance(x, (str, unicode, bytes)):
+                    x = x.split(d['multiple_seps']['ui_to_list'])
+                x = [y.strip() for y in x if y.strip()]
+                x = [y.decode(preferred_encoding, 'replace') if not isinstance(y,
+                    unicode) else y for y in x]
+                return [u' '.join(y.split()) for y in x]
+            else:
+                return x if x is None or isinstance(x, unicode) else \
+                        x.decode(preferred_encoding, 'replace')
+
+        def adapt_datetime(x, d):
+            if isinstance(x, (str, unicode, bytes)):
+                x = parse_date(x, assume_utc=False, as_utc=False)
+            return x
+
+        def adapt_bool(x, d):
+            if isinstance(x, (str, unicode, bytes)):
+                x = x.lower()
+                if x == 'true':
+                    x = True
+                elif x == 'false':
+                    x = False
+                elif x == 'none':
+                    x = None
+                else:
+                    x = bool(int(x))
+            return x
+
+        def adapt_enum(x, d):
+            v = adapt_text(x, d)
+            if not v:
+                v = None
+            return v
+
+        def adapt_number(x, d):
+            if x is None:
+                return None
+            if isinstance(x, (str, unicode, bytes)):
+                if x.lower() == 'none':
+                    return None
+            if d['datatype'] == 'int':
+                return int(x)
+            return float(x)
+
+        self.custom_data_adapters = {
+                'float': adapt_number,
+                'int':   adapt_number,
+                'rating':lambda x,d : x if x is None else min(10., max(0., float(x))),
+                'bool':  adapt_bool,
+                'comments': lambda x,d: adapt_text(x, {'is_multiple':False}),
+                'datetime' : adapt_datetime,
+                'text':adapt_text,
+                'series':adapt_text,
+                'enumeration': adapt_enum
+        }
+
+        # Create Tag Browser categories for custom columns
+        for k in sorted(self.custom_column_label_map.iterkeys()):
+            v = self.custom_column_label_map[k]
+            if v['normalized']:
+                is_category = True
+            else:
+                is_category = False
+            is_m = v['multiple_seps']
+            tn = 'custom_column_{0}'.format(v['num'])
+            self.field_metadata.add_custom_field(label=v['label'],
+                    table=tn, column='value', datatype=v['datatype'],
+                    colnum=v['num'], name=v['name'], display=v['display'],
+                    is_multiple=is_m, is_category=is_category,
+                    is_editable=v['editable'], is_csp=False)
+
+    # }}}
+
+    def initialize_tables(self): # {{{
+        tables = self.tables = {}
+        for col in ('title', 'sort', 'author_sort', 'series_index', 'comments',
+                'timestamp', 'published', 'uuid', 'path', 'cover',
+                'last_modified'):
+            metadata = self.field_metadata[col].copy()
+            if metadata['table'] is None:
+                metadata['table'], metadata['column'] == 'books', ('has_cover'
+                        if col == 'cover' else col)
+            tables[col] = OneToOneTable(col, metadata)
+
+        for col in ('series', 'publisher', 'rating'):
+            tables[col] = ManyToOneTable(col, self.field_metadata[col].copy())
+
+        for col in ('authors', 'tags', 'formats', 'identifiers'):
+            cls = {
+                    'authors':AuthorsTable,
+                    'formats':FormatsTable,
+                    'identifiers':IdentifiersTable,
+                  }.get(col, ManyToManyTable)
+            tables[col] = cls(col, self.field_metadata[col].copy())
+
+        tables['size'] = SizeTable('size', self.field_metadata['size'].copy())
+
+        for label, data in self.custom_column_label_map.iteritems():
+            metadata = self.field_metadata[label].copy()
+            link_table = self.custom_table_names(data['num'])[1]
+
+            if data['normalized']:
+                if metadata['is_multiple']:
+                    tables[label] = ManyToManyTable(label, metadata,
+                            link_table=link_table)
+                else:
+                    tables[label] = ManyToOneTable(label, metadata,
+                            link_table=link_table)
+                    if metadata['datatype'] == 'series':
+                        # Create series index table
+                        label += '_index'
+                        metadata = self.field_metadata[label].copy()
+                        metadata['column'] = 'extra'
+                        metadata['table'] = link_table
+                        tables[label] = OneToOneTable(label, metadata)
+            else:
+                tables[label] = OneToOneTable(label, metadata)
+    # }}}

    @property
    def conn(self):
@ -372,6 +591,15 @@ class DB(object, SchemaUpgrade):

    # Database layer API {{{

+    def custom_table_names(self, num):
+        return 'custom_column_%d'%num, 'books_custom_column_%d_link'%num
+
+    @property
+    def custom_tables(self):
+        return set([x[0] for x in self.conn.get(
+            'SELECT name FROM sqlite_master WHERE type="table" AND '
+            '(name GLOB "custom_column_*" OR name GLOB "books_custom_column_*")')])
+
    @classmethod
    def exists_at(cls, path):
        return path and os.path.exists(os.path.join(path, 'metadata.db'))
@ -405,39 +633,18 @@ class DB(object, SchemaUpgrade):
        return utcfromtimestamp(os.stat(self.dbpath).st_mtime)

    def read_tables(self):
-        tables = {}
-        for col in ('title', 'sort', 'author_sort', 'series_index', 'comments',
-                'timestamp', 'published', 'uuid', 'path', 'cover',
-                'last_modified'):
-            metadata = self.field_metadata[col].copy()
-            if metadata['table'] is None:
-                metadata['table'], metadata['column'] == 'books', ('has_cover'
-                        if col == 'cover' else col)
-            tables[col] = OneToOneTable(col, metadata)
-
-        for col in ('series', 'publisher', 'rating'):
-            tables[col] = ManyToOneTable(col, self.field_metadata[col].copy())
-
-        for col in ('authors', 'tags', 'formats', 'identifiers'):
-            cls = {
-                    'authors':AuthorsTable,
-                    'formats':FormatsTable,
-                    'identifiers':IdentifiersTable,
-                  }.get(col, ManyToManyTable)
-            tables[col] = cls(col, self.field_metadata[col].copy())
-
-        tables['size'] = SizeTable('size', self.field_metadata['size'].copy())
+        '''
+        Read all data from the db into the python in-memory tables
+        '''

        with self.conn: # Use a single transaction, to ensure nothing modifies
                        # the db while we are reading
-            for table in tables.itervalues():
+            for table in self.tables.itervalues():
                try:
                    table.read()
                except:
                    prints('Failed to read table:', table.name)
                    raise

-        return tables
-
   # }}}

--- a/src/calibre/db/tables.py
+++ b/src/calibre/db/tables.py
@ -32,7 +32,7 @@ def _c_convert_timestamp(val):

 class Table(object):

-    def __init__(self, name, metadata):
+    def __init__(self, name, metadata, link_table=None):
        self.name, self.metadata = name, metadata

        # self.adapt() maps values from the db to python objects
@ -46,6 +46,9 @@ class Table(object):
            # Legacy
            self.adapt = lambda x: x.replace('|', ',') if x else None

+        self.link_table = (link_table if link_table else
+                'books_%s_link'%self.metadata['table'])
+
 class OneToOneTable(Table):

    '''
@ -95,8 +98,8 @@ class ManyToOneTable(Table):

    def read_maps(self, db):
        for row in db.conn.execute(
-                'SELECT book, {0} FROM books_{1}_link'.format(
-                    self.metadata['link_column'], self.metadata['table'])):
+                'SELECT book, {0} FROM {1}'.format(
+                    self.metadata['link_column'], self.link_table)):
            if row[1] not in self.col_book_map:
                self.col_book_map[row[1]] = []
            self.col_book_map.append(row[0])
@ -112,8 +115,8 @@ class ManyToManyTable(ManyToOneTable):

    def read_maps(self, db):
        for row in db.conn.execute(
-                'SELECT book, {0} FROM books_{1}_link'.format(
-                    self.metadata['link_column'], self.metadata['table'])):
+                'SELECT book, {0} FROM {1}'.format(
+                    self.metadata['link_column'], self.link_table)):
            if row[1] not in self.col_book_map:
                self.col_book_map[row[1]] = []
            self.col_book_map.append(row[0])