merge from trunk

2025-12-08 14:15:04 -05:00 · 2013-05-04 18:47:17 +02:00 · 2013-05-04 18:47:17 +02:00 · c013ece5af
commit c013ece5af
parent ccd0a463ec e7268bc39f
13 changed files with 356 additions and 104 deletions
--- a/manual/gui.rst
+++ b/manual/gui.rst
@ -586,6 +586,8 @@ Calibre has several keyboard shortcuts to save you time and mouse movement. Thes
      - Focus the book list
    * - :kbd:`Ctrl+Esc`
      - Clear the virtual library
    * - :kbd:`Alt+Esc`
      - Clear the additional restriction
    * - :kbd:`N or F3`
      - Find the next book that matches the current search (only works if the highlight checkbox next to the search bar is checked)
    * - :kbd:`Shift+N or Shift+F3`
--- a/recipes/the_sun.recipe
+++ b/recipes/the_sun.recipe
@ -1,4 +1,4 @@
-import re, random
+import random
 from calibre import browser
 from calibre.web.feeds.recipes import BasicNewsRecipe
@ -8,7 +8,7 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    title          = u'The Sun UK'
    description = 'Articles from The Sun tabloid UK'
    __author__ = 'Dave Asbury'
-    # last updated 19/10/12 better cover fetch
+    # last updated 5/5/13 better cover fetch
    language = 'en_GB'
    oldest_article = 1
    max_articles_per_feed = 15
@ -29,16 +29,12 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
        dict(name='div',attrs={'class' : 'intro'}),
                                dict(name='h3'),
        dict(name='div',attrs={'id' : 'articlebody'}),
           #dict(attrs={'class' : ['right_col_branding','related-stories','mystery-meat-link','ltbx-container','ltbx-var ltbx-hbxpn','ltbx-var ltbx-nav-loop','ltbx-var ltbx-url']}),
           #                dict(name='div',attrs={'class' : 'cf'}),
          # dict(attrs={'title' : 'download flash'}),
          #                 dict(attrs={'style' : 'padding: 5px'})
-           ]
+    ]
    remove_tags_after = [dict(id='bodyText')]
    remove_tags=[
-                  dict(name='li'),
+                    dict(name='li'),
-                              dict(attrs={'class' : 'grid-4 right-hand-column'}),
+                    dict(attrs={'class' : 'grid-4 right-hand-column'}),
        ]
    feeds          = [
@ -47,40 +43,24 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    (u'Showbiz', u'http://www.thesun.co.uk/sol/homepage/showbiz/rss'),
    (u'Woman', u'http://www.thesun.co.uk/sol/homepage/woman/rss'),
    ]
-# starsons code
+    # starsons code
-    def parse_feeds (self):
+    def parse_feeds(self):
-      feeds = BasicNewsRecipe.parse_feeds(self)
+        feeds = BasicNewsRecipe.parse_feeds(self)
-      for feed in feeds:
+        for feed in feeds:
-        for article in feed.articles[:]:
+            for article in feed.articles[:]:
-          print 'article.title is: ', article.title
+                if 'Try out The Sun' in article.title.upper() or 'Try-out-The-Suns' in article.url:
-          if 'Try out The Sun' in article.title.upper() or 'Try-out-The-Suns' in article.url:
+                    feed.articles.remove(article)
-            feed.articles.remove(article)
+                if 'Web porn harms kids' in article.title.upper() or 'Sun-says-Web-porn' in article.url:
-          if 'Web porn harms kids' in article.title.upper() or 'Sun-says-Web-porn' in article.url:
+                    feed.articles.remove(article)
-            feed.articles.remove(article)
+        return feeds
      return feeds
    def get_cover_url(self):
        soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
        # look for the block containing the sun button and url
        cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'})
        #cov = soup.find(attrs={'id' : 'large'})
        cov2 = str(cov)
        cov2='http://www.politicshome.com'+cov2[9:-133]
        #cov2 now contains url of the page containing pic
        #cov2 now contains url of the page containing pic
        soup = self.index_to_soup(cov2)
        cov = soup.find(attrs={'id' : 'large'})
        cov=str(cov)
        cov2 =  re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
        cov2 = str(cov2)
        cov2=cov2[2:len(cov2)-2]
        br = browser()
        br.set_handle_redirect(False)
        cover_url = 'http://www.thepaperboy.com/frontpages/current/The_Sun_newspaper_front_page.jpg'
        try:
-            br.open_novisit(cov2)
+            br.open_novisit('http://www.thepaperboy.com/frontpages/current/The_Sun_newspaper_front_page.jpg')
            cover_url = cov2
        except:
            cover_url = random.choice([
                'http://img.thesun.co.uk/multimedia/archive/00905/errorpage6_677961a_905507a.jpg'
@ -88,6 +68,6 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
                ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage5_677960a_905512a.jpg'
                ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage2_677957a_905502a.jpg'
                ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage3_677958a_905503a.jpg'
-                ])
+            ])
        return cover_url
--- a/src/calibre/ebooks/docx/__init__.py
+++ b/src/calibre/ebooks/docx/__init__.py
@ -0,0 +1,11 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 class InvalidDOCX(ValueError):
    ''' Error type used to report a file that is not a usable DOCX document.
    It derives from ValueError, so handlers for ValueError also catch it. '''
--- a/src/calibre/ebooks/docx/container.py
+++ b/src/calibre/ebooks/docx/container.py
@ -0,0 +1,203 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 import os, sys, shutil
 from lxml import etree
 from calibre import walk, guess_type
 from calibre.ebooks.metadata import string_to_authors
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.docx import InvalidDOCX
 from calibre.ebooks.docx.names import DOCUMENT, DOCPROPS, XPath, APPPROPS
 from calibre.ptempfile import PersistentTemporaryDirectory
 from calibre.utils.localization import canonicalize_lang
 from calibre.utils.logging import default_log
 from calibre.utils.zipfile import ZipFile
 from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
 def fromstring(raw, parser=RECOVER_PARSER):
    ''' Parse ``raw`` with lxml and return the resulting root element.

    :param raw: The serialized XML to parse.
    :param parser: The lxml parser to use, RECOVER_PARSER unless overridden.
    '''
    root = etree.fromstring(raw, parser=parser)
    return root
 # Read metadata {{{
 def read_doc_props(raw, mi):
    '''
    Populate ``mi`` from the core document properties XML in ``raw``.

    The following elements are read: ``dc:title`` (first one only),
    ``dc:subject`` and ``cp:keywords`` (both become tags), ``dc:creator``
    (authors), ``dc:description`` (comments) and ``dc:language``.

    :param raw: Serialized XML of the properties part, parsed via fromstring().
    :param mi: Metadata object that is updated in place. ``title``, ``tags``,
        ``authors`` and ``languages`` are only assigned when a non-empty value
        was found; ``comments`` is assigned whenever a ``dc:description``
        element exists.
    '''
    root = fromstring(raw)
    titles = XPath('//dc:title')(root)
    if titles:
        title = titles[0].text
        if title and title.strip():
            mi.title = title.strip()
    tags = []
    for subject in XPath('//dc:subject')(root):
        if subject.text and subject.text.strip():
            # NOTE(review): commas are replaced presumably because a comma is
            # used as a tag separator elsewhere -- confirm
            tags.append(subject.text.strip().replace(',', '_'))
    for keywords in XPath('//cp:keywords')(root):
        if keywords.text and keywords.text.strip():
            for chunk in keywords.text.split():
                # Splitting "a, b" on whitespace and then on commas yields
                # empty fragments ("a," -> "a", ""). Skip them so that no
                # blank tags are recorded.
                tags.extend(y.strip() for y in chunk.split(',') if y.strip())
    if tags:
        mi.tags = tags
    aut = []
    for author in XPath('//dc:creator')(root):
        if author.text and author.text.strip():
            aut.extend(string_to_authors(author.text))
    if aut:
        mi.authors = aut
    desc = XPath('//dc:description')(root)
    if desc:
        # Only the text content of the first description element is used
        mi.comments = etree.tostring(desc[0], method='text', encoding=unicode)
    langs = []
    for lang in XPath('//dc:language')(root):
        if lang.text and lang.text.strip():
            code = canonicalize_lang(lang.text)
            # Languages that canonicalize_lang() maps to a false value are dropped
            if code:
                langs.append(code)
    if langs:
        mi.languages = langs
 def read_app_props(raw, mi):
    ''' Set ``mi.publisher`` from the first ``Company`` element found in
    ``raw``, provided that element has non-blank text. Elements are matched
    by local name, so the namespace they are in does not matter. '''
    matches = fromstring(raw).xpath('//*[local-name()="Company"]')
    if not matches:
        return
    text = matches[0].text
    if text and text.strip():
        mi.publisher = text.strip()
 # }}}
 class DOCX(object):
    '''
    Access to the parts stored inside a DOCX (ZIP based) file.

    The file is either extracted into a temporary directory (``extract=True``)
    or read directly from the ZIP archive (``extract=False``). In both modes
    ``self.names`` holds the names of the parts (a dict mapping name to the
    extracted path, or a frozenset of archive member names) and :meth:`read`
    returns the bytes of a part.

    Call :meth:`close` when done, it releases the ZIP file or removes the
    temporary directory and closes the underlying file if it was opened here.
    '''
    def __init__(self, path_or_stream, log=None, extract=True):
        '''
        :param path_or_stream: A path, or an object with a ``read`` attribute.
        :param log: Logger to use, default_log when not specified.
        :param extract: If True extract to a temporary directory, otherwise
            read parts directly from the ZIP file.
        :raises InvalidDOCX: If ``[Content_Types].xml`` or ``_rels/.rels`` is
            missing.
        '''
        # Only a stream that was opened here may be closed by this object,
        # streams passed in by the caller remain the caller's responsibility
        self._owns_stream = not hasattr(path_or_stream, 'read')
        stream = open(path_or_stream, 'rb') if self._owns_stream else path_or_stream
        self._stream = stream
        self.name = getattr(stream, 'name', None) or '<stream>'
        self.log = log or default_log
        ok = False
        try:
            if extract:
                self.extract(stream)
                # All parts are on disk now, the source is no longer needed
                self._close_owned_stream()
            else:
                self.init_zipfile(stream)
            self.read_content_types()
            self.read_package_relationships()
            ok = True
        finally:
            if not ok:
                # The caller never gets an object to call close() on, so
                # release whatever was acquired before the failure
                self.close()
    def _close_owned_stream(self):
        ''' Close the underlying stream, but only if it was opened by
        __init__(). Safe to call more than once. '''
        stream = getattr(self, '_stream', None)
        self._stream = None
        if stream is not None and getattr(self, '_owns_stream', False):
            stream.close()
    def init_zipfile(self, stream):
        ''' Open ``stream`` as a ZIP file and record its member names. '''
        self.zipf = ZipFile(stream)
        self.names = frozenset(self.zipf.namelist())
    def extract(self, stream):
        ''' Extract ``stream`` into a new temporary directory and build
        ``self.names``, a map of part name (always ``/`` separated) to the
        path of the extracted file. If the normal ZIP reader fails, the more
        forgiving extractor from calibre.utils.localunzip is tried. '''
        self.tdir = PersistentTemporaryDirectory('docx_container')
        try:
            zf = ZipFile(stream)
            try:
                zf.extractall(self.tdir)
            finally:
                zf.close()
        except Exception:
            # Exception rather than a bare except, so that KeyboardInterrupt
            # and SystemExit are not mistaken for a damaged ZIP file
            self.log.exception('DOCX appears to be invalid ZIP file, trying a'
                    ' more forgiving ZIP parser')
            from calibre.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream, self.tdir)
        self.names = {}
        for f in walk(self.tdir):
            name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
            self.names[name] = f
    def read(self, name):
        ''' Return the raw bytes of the part ``name``. In extracted mode a
        missing part raises KeyError from the ``self.names`` lookup. '''
        if hasattr(self, 'zipf'):
            return self.zipf.open(name).read()
        path = self.names[name]
        with open(path, 'rb') as f:
            return f.read()
    def read_content_types(self):
        ''' Parse ``[Content_Types].xml`` into ``self.content_types`` (part
        name to content type, from Override elements) and
        ``self.default_content_types`` (lower case extension to content type,
        from Default elements).

        :raises InvalidDOCX: If the part cannot be found.
        '''
        try:
            raw = self.read('[Content_Types].xml')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name)
        root = fromstring(raw)
        self.content_types = {}
        self.default_content_types = {}
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'):
            self.default_content_types[item.get('Extension').lower()] = item.get('ContentType')
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Override" and @PartName and @ContentType]'):
            # Part names are stored without the leading slash, matching self.names
            name = item.get('PartName').lstrip('/')
            self.content_types[name] = item.get('ContentType')
    def content_type(self, name):
        ''' Return the content type of the part ``name``. An explicit override
        wins, then the default for the file extension, and finally whatever
        guess_type() reports for the name. '''
        if name in self.content_types:
            return self.content_types[name]
        ext = name.rpartition('.')[-1].lower()
        if ext in self.default_content_types:
            return self.default_content_types[ext]
        return guess_type(name)[0]
    def read_package_relationships(self):
        ''' Parse ``_rels/.rels`` into ``self.relationships`` (relationship
        type to target) and ``self.relationships_rmap`` (target to type). If a
        type or target occurs more than once, the last occurrence wins.

        :raises InvalidDOCX: If the part cannot be found.
        '''
        try:
            raw = self.read('_rels/.rels')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name)
        root = fromstring(raw)
        self.relationships = {}
        self.relationships_rmap = {}
        for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
            target = item.get('Target').lstrip('/')
            typ = item.get('Type')
            self.relationships[typ] = target
            self.relationships_rmap[target] = typ
    @property
    def document(self):
        ''' The parsed root element of the main document part. The part is
        located via the DOCUMENT relationship, falling back to any part named
        ``document.xml``.

        :raises InvalidDOCX: If no such part exists.
        '''
        name = self.relationships.get(DOCUMENT, None)
        if name is None:
            names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml'))
            if not names:
                raise InvalidDOCX('The file %s docx file has no main document' % self.name)
            name = names[0]
        return fromstring(self.read(name))
    def _read_props(self, rel_type, fallback_name, reader, mi):
        ''' Locate a properties part by relationship type, or failing that by
        its conventional (lower cased) name, and pass its contents to
        ``reader(raw, mi)``. A part that cannot be read is silently skipped. '''
        name = self.relationships.get(rel_type, None)
        if name is None:
            for n in self.names:
                if n.lower() == fallback_name:
                    name = n
                    break
        if name:
            try:
                raw = self.read(name)
            except KeyError:
                # Referenced but not present, metadata reading is best effort
                pass
            else:
                reader(raw, mi)
    @property
    def metadata(self):
        ''' A new Metadata object filled in from the core properties
        (read_doc_props) and the extended properties (read_app_props) parts,
        when these are present. '''
        mi = Metadata(_('Unknown'))
        self._read_props(DOCPROPS, 'docprops/core.xml', read_doc_props, mi)
        self._read_props(APPPROPS, 'docprops/app.xml', read_app_props, mi)
        return mi
    def close(self):
        ''' Release all resources: close the ZIP file or remove the temporary
        directory (errors while removing it are ignored), then close the
        underlying file if it was opened by this object. '''
        if hasattr(self, 'zipf'):
            self.zipf.close()
        elif hasattr(self, 'tdir'):
            try:
                shutil.rmtree(self.tdir)
            except EnvironmentError:
                pass
        self._close_owned_stream()
 if __name__ == '__main__':
    # Print the metadata of the file named by the last command line argument,
    # reading directly from the ZIP file instead of extracting it
    d = DOCX(sys.argv[-1], extract=False)
    try:
        print (d.metadata)
    finally:
        # Always release the ZIP file, even if reading the metadata fails
        d.close()
--- a/src/calibre/ebooks/docx/names.py
+++ b/src/calibre/ebooks/docx/names.py
@ -0,0 +1,47 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 from lxml.etree import XPath as X
 # Relationship type URIs of the main document part, the core properties part
 # and the extended (application) properties part
 DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument'
 DOCPROPS = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties'
 APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties'
 # Map of prefix to namespace URI, these are the prefixes that can be used in
 # expressions compiled with XPath() below
 namespaces = {
    'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
    'o': 'urn:schemas-microsoft-com:office:office',
    've': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
    # Text Content
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'w10': 'urn:schemas-microsoft-com:office:word',
    'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',
    # Drawing
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
    'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
    'mv': 'urn:schemas-microsoft-com:mac:vml',
    'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
    'v': 'urn:schemas-microsoft-com:vml',
    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
    # Properties (core and extended)
    'cp': 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'ep': 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
    # Content Types
    'ct': 'http://schemas.openxmlformats.org/package/2006/content-types',
    # Package Relationships
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    'pr': 'http://schemas.openxmlformats.org/package/2006/relationships',
    # Dublin Core document properties
    'dcmitype': 'http://purl.org/dc/dcmitype/',
    'dcterms': 'http://purl.org/dc/terms/'
 }
 def XPath(expr):
    ''' Compile ``expr`` into an lxml XPath object in which the prefixes
    defined in ``namespaces`` are available. '''
    compiled = X(expr, namespaces=namespaces)
    return compiled
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@ -0,0 +1,41 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 import sys, os
 from lxml import html
 from lxml.html.builder import (HTML, HEAD, TITLE, BODY, LINK, META)
 # The container module defines the class DOCX, it has no name Container
 from calibre.ebooks.docx.container import DOCX
 class Convert(object):
    '''
    Skeleton of the DOCX to HTML conversion. It opens the DOCX file and, when
    called, writes an ``index.html`` containing only a head (charset,
    placeholder title and a link to ``docx.css``) and an empty body.
    '''
    def __init__(self, path_or_stream, dest_dir=None, log=None):
        '''
        :param path_or_stream: Passed unchanged to the DOCX container.
        :param dest_dir: Directory to write the output into, the current
            working directory when not specified.
        :param log: Logger passed to the container, the log actually used is
            then taken back from the container.
        '''
        self.container = DOCX(path_or_stream, log=log)
        self.log = self.container.log
        self.dest_dir = dest_dir or os.getcwdu()
        self.body = BODY()
        self.html = HTML(
            HEAD(
                META(charset='utf-8'),
                TITLE('TODO: read from metadata'),
                LINK(rel='stylesheet', type='text/css', href='docx.css'),
            ),
            self.body
        )
    def __call__(self):
        ''' Run the conversion, currently that only writes the output file. '''
        self.write()
    def write(self):
        ''' Serialize the HTML tree as UTF-8 with an HTML 5 doctype into
        ``index.html`` inside ``self.dest_dir``. '''
        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
        with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
            f.write(raw)
 if __name__ == '__main__':
    # Convert the file named by the last command line argument
    Convert(sys.argv[-1])()
--- a/src/calibre/ebooks/metadata/book/base.py
+++ b/src/calibre/ebooks/metadata/book/base.py
@ -178,6 +178,8 @@ class Metadata(object):
        return key in object.__getattribute__(self, '_data')
    def deepcopy(self):
        ''' Do not use this method unless you know what you are doing, if you want to create a simple clone of
        this object, use :method:`deepcopy_metadata` instead. '''
        m = Metadata(None)
        m.__dict__ = copy.deepcopy(self.__dict__)
        object.__setattr__(m, '_data', copy.deepcopy(object.__getattribute__(self, '_data')))
--- a/src/calibre/ebooks/metadata/book/formatter.py
+++ b/src/calibre/ebooks/metadata/book/formatter.py
@ -14,16 +14,15 @@ class SafeFormat(TemplateFormatter):
    def __init__(self):
        TemplateFormatter.__init__(self)
        from calibre.ebooks.metadata.book.base import field_metadata
        self.field_metadata = field_metadata
    def get_value(self, orig_key, args, kwargs):
        if not orig_key:
            return ''
        key = orig_key = orig_key.lower()
-        if key != 'title_sort' and key not in TOP_LEVEL_IDENTIFIERS and \
+        if (key != 'title_sort' and key not in TOP_LEVEL_IDENTIFIERS and
-                key not in ALL_METADATA_FIELDS:
+                key not in ALL_METADATA_FIELDS):
-            key = self.field_metadata.search_term_to_field_key(key)
+            from calibre.ebooks.metadata.book.base import field_metadata
            key = field_metadata.search_term_to_field_key(key)
            if key is None or (self.book and
                                key not in self.book.all_field_keys()):
                if hasattr(self.book, orig_key):
--- a/src/calibre/ebooks/metadata/docx.py
+++ b/src/calibre/ebooks/metadata/docx.py
@ -7,70 +7,21 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
-from lxml import etree
+from calibre.ebooks.docx.container import DOCX
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.utils.zipfile import ZipFile
 from calibre.utils.magick.draw import identify_data
 from calibre.ebooks.oeb.base import DC11_NS
 from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
 NSMAP = {'dc':DC11_NS,
 'cp':'http://schemas.openxmlformats.org/package/2006/metadata/core-properties'}
 def XPath(expr):
    return etree.XPath(expr, namespaces=NSMAP)
 def _read_doc_props(raw, mi):
    from calibre.ebooks.metadata import string_to_authors
    root = etree.fromstring(raw, parser=RECOVER_PARSER)
    titles = XPath('//dc:title')(root)
    if titles:
        title = titles[0].text
        if title and title.strip():
            mi.title = title.strip()
    tags = []
    for subject in XPath('//dc:subject')(root):
        if subject.text and subject.text.strip():
            tags.append(subject.text.strip().replace(',', '_'))
    for keywords in XPath('//cp:keywords')(root):
        if keywords.text and keywords.text.strip():
            for x in keywords.text.split():
                tags.extend(y.strip() for y in x.split(','))
    if tags:
        mi.tags = tags
    authors = XPath('//dc:creator')(root)
    aut = []
    for author in authors:
        if author.text and author.text.strip():
            aut.extend(string_to_authors(author.text))
    if aut:
        mi.authors = aut
    desc = XPath('//dc:description')(root)
    if desc:
        raw = etree.tostring(desc[0], method='text', encoding=unicode)
        mi.comments = raw
 def _read_app_props(raw, mi):
    root = etree.fromstring(raw, parser=RECOVER_PARSER)
    company = root.xpath('//*[local-name()="Company"]')
    if company and company[0].text and company[0].text.strip():
        mi.publisher = company[0].text.strip()
 def get_metadata(stream):
    c = DOCX(stream, extract=False)
    mi = c.metadata
    c.close()
    stream.seek(0)
    cdata = None
    with ZipFile(stream, 'r') as zf:
        mi = Metadata(_('Unknown'))
        cdata = None
        for zi in zf.infolist():
            ext = zi.filename.rpartition('.')[-1].lower()
-            if zi.filename.lower() == 'docprops/core.xml':
+            if cdata is None and ext in {'jpeg', 'jpg', 'png', 'gif'}:
                _read_doc_props(zf.read(zi), mi)
            elif zi.filename.lower() == 'docprops/app.xml':
                _read_app_props(zf.read(zi), mi)
            elif cdata is None and ext in {'jpeg', 'jpg', 'png', 'gif'}:
                raw = zf.read(zi)
                try:
                    width, height, fmt = identify_data(raw)
--- a/src/calibre/ebooks/oeb/reader.py
+++ b/src/calibre/ebooks/oeb/reader.py
@ -24,6 +24,7 @@ from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \
                                    urlnormalize, BINARY_MIME, \
                                    OEBError, OEBBook, DirContainer
 from calibre.ebooks.oeb.writer import OEBWriter
 from calibre.utils.cleantext import clean_xml_chars
 from calibre.utils.localization import get_lang
 from calibre.ptempfile import TemporaryDirectory
 from calibre.constants import __appname__, __version__
@ -106,7 +107,7 @@ class OEBReader(object):
        try:
            opf = etree.fromstring(data)
        except etree.XMLSyntaxError:
-            data = xml_replace_entities(data, encoding=None)
+            data = xml_replace_entities(clean_xml_chars(data), encoding=None)
            try:
                opf = etree.fromstring(data)
                self.logger.warn('OPF contains invalid HTML named entities')
--- a/src/calibre/gui2/actions/edit_metadata.py
+++ b/src/calibre/gui2/actions/edit_metadata.py
@ -240,9 +240,10 @@ class EditMetadataAction(InterfaceAction):
                        opf, cov = id_map[book_id]
                        cfile = mi.cover
                        mi.cover, mi.cover_data = None, (None, None)
-                        with open(opf, 'wb') as f:
+                        if opf is not None:
-                            f.write(metadata_to_opf(mi))
+                            with open(opf, 'wb') as f:
-                        if cfile:
+                                f.write(metadata_to_opf(mi))
                        if cfile and cov:
                            shutil.copyfile(cfile, cov)
                            os.remove(cfile)
                    nid_map[book_id] = id_map[book_id]
--- a/src/calibre/gui2/search_restriction_mixin.py
+++ b/src/calibre/gui2/search_restriction_mixin.py
@ -549,6 +549,9 @@ class SearchRestrictionMixin(object):
                restriction = ''
            self._apply_search_restriction(restriction, r)
    def clear_additional_restriction(self):
        ''' Apply an empty restriction with an empty name by calling
        _apply_search_restriction(). '''
        nothing = ''
        self._apply_search_restriction(nothing, nothing)
    def _apply_search_restriction(self, restriction, name):
        self.saved_search.clear()
        # The order below is important. Set the restriction, force a '' search
@ -561,6 +564,10 @@ class SearchRestrictionMixin(object):
        self.set_number_of_books_shown()
        self.current_view().setFocus(Qt.OtherFocusReason)
        self.set_window_title()
        v = self.current_view()
        if not v.currentIndex().isValid():
            v.set_current_row()
        v.refresh_book_details()
    def set_number_of_books_shown(self):
        db = self.library_view.model().db
--- a/src/calibre/gui2/ui.py
+++ b/src/calibre/gui2/ui.py
@ -279,6 +279,13 @@ class Main(MainWindow, MainWindowMixin, DeviceMixin, EmailMixin,  # {{{
                action=self.ctrl_esc_action)
        self.ctrl_esc_action.triggered.connect(self.ctrl_esc)
        self.alt_esc_action = QAction(self)
        self.addAction(self.alt_esc_action)
        self.keyboard.register_shortcut('clear additional restriction',
                _('Clear the additional restriction'), default_keys=('Alt+Esc',),
                action=self.alt_esc_action)
        self.alt_esc_action.triggered.connect(self.clear_additional_restriction)
        ####################### Start spare job server ########################
        QTimer.singleShot(1000, self.add_spare_server)