From 8f08d9446dc5578f558d1e74c94e019c51cb7961 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 4 May 2013 07:46:41 +0530
Subject: [PATCH 01/11] Start work on docx input plugin

---
 src/calibre/ebooks/docx/__init__.py  |  11 +++
 src/calibre/ebooks/docx/container.py | 100 +++++++++++++++++++++++++++
 src/calibre/ebooks/docx/names.py     |  40 +++++++++++
 3 files changed, 151 insertions(+)
 create mode 100644 src/calibre/ebooks/docx/__init__.py
 create mode 100644 src/calibre/ebooks/docx/container.py
 create mode 100644 src/calibre/ebooks/docx/names.py
diff --git a/src/calibre/ebooks/docx/__init__.py b/src/calibre/ebooks/docx/__init__.py
new file mode 100644
index 0000000000..f8bda2506d
--- /dev/null
+++ b/src/calibre/ebooks/docx/__init__.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+class InvalidDOCX(ValueError):
+    pass
+
diff --git a/src/calibre/ebooks/docx/container.py b/src/calibre/ebooks/docx/container.py
new file mode 100644
index 0000000000..efbe7b8fcb
--- /dev/null
+++ b/src/calibre/ebooks/docx/container.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os, sys
+
+from lxml import etree
+
+from calibre import walk, guess_type
+from calibre.ebooks.docx import InvalidDOCX
+from calibre.ebooks.docx.names import DOCUMENT
+from calibre.ptempfile import PersistentTemporaryDirectory
+from calibre.utils.logging import default_log
+from calibre.utils.zipfile import ZipFile
+
+class DOCX(object):
+
+    def __init__(self, path_or_stream, log=None):
+        stream = path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb')
+        self.name = getattr(stream, 'name', None) or '<stream>'
+        self.log = log or default_log
+        self.tdir = PersistentTemporaryDirectory('docx_container')
+
+        self.extract(stream)
+        self.read_content_types()
+        self.read_package_relationships()
+
+    def extract(self, stream):
+        try:
+            zf = ZipFile(stream)
+            zf.extractall(self.tdir)
+        except:
+            self.log.exception('DOCX appears to be invalid ZIP file, trying a'
+                    ' more forgiving ZIP parser')
+            from calibre.utils.localunzip import extractall
+            stream.seek(0)
+            extractall(stream, self.tdir)
+
+        self.names = {}
+        for f in walk(self.tdir):
+            name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
+            self.names[name] = f
+
+    def read(self, name):
+        path = self.names[name]
+        with open(path, 'rb') as f:
+            return f.read()
+
+    def read_content_types(self):
+        try:
+            raw = self.read('[Content_Types].xml')
+        except KeyError:
+            raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name)
+        root = etree.fromstring(raw)
+        self.content_types = {}
+        self.default_content_types = {}
+        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'):
+            self.default_content_types[item.get('Extension').lower()] = item.get('ContentType')
+        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Override" and @PartName and @ContentType]'):
+            name = item.get('PartName').lstrip('/')
+            self.content_types[name] = item.get('ContentType')
+
+    def content_type(self, name):
+        if name in self.content_types:
+            return self.content_types[name]
+        ext = name.rpartition('.')[-1].lower()
+        if ext in self.default_content_types:
+            return self.default_content_types[ext]
+        return guess_type(name)[0]
+
+    def read_package_relationships(self):
+        try:
+            raw = self.read('_rels/.rels')
+        except KeyError:
+            raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name)
+        root = etree.fromstring(raw)
+        self.relationships = {}
+        self.relationships_rmap = {}
+        for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
+            target = item.get('Target').lstrip('/')
+            typ = item.get('Type')
+            self.relationships[typ] = target
+            self.relationships_rmap[target] = typ
+
+    @property
+    def document(self):
+        name = self.relationships.get(DOCUMENT, None)
+        if name is None:
+            names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml'))
+            if not names:
+                raise InvalidDOCX('The file %s docx file has no main document' % self.name)
+            name = names[0]
+        return etree.fromstring(self.read(name))
+
+if __name__ == '__main__':
+    d = DOCX(sys.argv[-1])
diff --git a/src/calibre/ebooks/docx/names.py b/src/calibre/ebooks/docx/names.py
new file mode 100644
index 0000000000..0a31d08ab7
--- /dev/null
+++ b/src/calibre/ebooks/docx/names.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument'
+
+namespaces = {
+    'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
+    'o': 'urn:schemas-microsoft-com:office:office',
+    've': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
+    # Text Content
+    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
+    'w10': 'urn:schemas-microsoft-com:office:word',
+    'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',
+    # Drawing
+    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
+    'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
+    'mv': 'urn:schemas-microsoft-com:mac:vml',
+    'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
+    'v': 'urn:schemas-microsoft-com:vml',
+    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
+    # Properties (core and extended)
+    'cp': 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties',
+    'dc': 'http://purl.org/dc/elements/1.1/',
+    'ep': 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties',
+    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
+    # Content Types
+    'ct': 'http://schemas.openxmlformats.org/package/2006/content-types',
+    # Package Relationships
+    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
+    'pr': 'http://schemas.openxmlformats.org/package/2006/relationships',
+    # Dublin Core document properties
+    'dcmitype': 'http://purl.org/dc/dcmitype/',
+    'dcterms': 'http://purl.org/dc/terms/'
+}
+

From f7a44c80f8666e1d95661616cec01d1f030c31d8 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 4 May 2013 07:47:12 +0530
Subject: [PATCH 02/11] ...

---
 src/calibre/ebooks/oeb/reader.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py
index 6a3747d2d3..eb7e2eca4c 100644
--- a/src/calibre/ebooks/oeb/reader.py
+++ b/src/calibre/ebooks/oeb/reader.py
@@ -24,6 +24,7 @@ from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \
                                     urlnormalize, BINARY_MIME, \
                                     OEBError, OEBBook, DirContainer
 from calibre.ebooks.oeb.writer import OEBWriter
+from calibre.utils.cleantext import clean_xml_chars
 from calibre.utils.localization import get_lang
 from calibre.ptempfile import TemporaryDirectory
 from calibre.constants import __appname__, __version__
@@ -106,7 +107,7 @@ class OEBReader(object):
         try:
             opf = etree.fromstring(data)
         except etree.XMLSyntaxError:
-            data = xml_replace_entities(data, encoding=None)
+            data = xml_replace_entities(clean_xml_chars(data), encoding=None)
             try:
                 opf = etree.fromstring(data)
                 self.logger.warn('OPF contains invalid HTML named entities')

From 654ce41161aa7e38aab28f164ce4f8a943568a76 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 4 May 2013 08:05:24 +0530
Subject: [PATCH 03/11] Fix error when downloading only covers and reviewing
 downloaded metadata. Fixes #1176253 (Bulk Download Covers Only - 0.9.29)

---
 src/calibre/gui2/actions/edit_metadata.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/calibre/gui2/actions/edit_metadata.py b/src/calibre/gui2/actions/edit_metadata.py
index 19c7ee127e..0fd5d31944 100644
--- a/src/calibre/gui2/actions/edit_metadata.py
+++ b/src/calibre/gui2/actions/edit_metadata.py
@@ -240,9 +240,10 @@ class EditMetadataAction(InterfaceAction):
                         opf, cov = id_map[book_id]
                         cfile = mi.cover
                         mi.cover, mi.cover_data = None, (None, None)
-                        with open(opf, 'wb') as f:
-                            f.write(metadata_to_opf(mi))
-                        if cfile:
+                        if opf is not None:
+                            with open(opf, 'wb') as f:
+                                f.write(metadata_to_opf(mi))
+                        if cfile and cov:
                             shutil.copyfile(cfile, cov)
                             os.remove(cfile)
                     nid_map[book_id] = id_map[book_id]

From 7af7030d983deeee9d99a1ae36bed4d1c7b981e7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 4 May 2013 08:53:57 +0530
Subject: [PATCH 04/11] Fix regression that broke deepcopying of Metadata()
 objects

---
 src/calibre/ebooks/metadata/book/formatter.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/calibre/ebooks/metadata/book/formatter.py b/src/calibre/ebooks/metadata/book/formatter.py
index 4ddd3d68df..7adbe81016 100644
--- a/src/calibre/ebooks/metadata/book/formatter.py
+++ b/src/calibre/ebooks/metadata/book/formatter.py
@@ -14,16 +14,15 @@ class SafeFormat(TemplateFormatter):
 
     def __init__(self):
         TemplateFormatter.__init__(self)
-        from calibre.ebooks.metadata.book.base import field_metadata
-        self.field_metadata = field_metadata
 
     def get_value(self, orig_key, args, kwargs):
         if not orig_key:
             return ''
         key = orig_key = orig_key.lower()
-        if key != 'title_sort' and key not in TOP_LEVEL_IDENTIFIERS and \
-                key not in ALL_METADATA_FIELDS:
-            key = self.field_metadata.search_term_to_field_key(key)
+        if (key != 'title_sort' and key not in TOP_LEVEL_IDENTIFIERS and
+                key not in ALL_METADATA_FIELDS):
+            from calibre.ebooks.metadata.book.base import field_metadata
+            key = field_metadata.search_term_to_field_key(key)
             if key is None or (self.book and
                                 key not in self.book.all_field_keys()):
                 if hasattr(self.book, orig_key):

From c4361f88c486cbac957fe7c1d5f80853f9bd1ce9 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 4 May 2013 08:56:09 +0530
Subject: [PATCH 05/11] ...

---
 src/calibre/ebooks/metadata/book/base.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/calibre/ebooks/metadata/book/base.py b/src/calibre/ebooks/metadata/book/base.py
index 2f11ca9326..4104b18a3f 100644
--- a/src/calibre/ebooks/metadata/book/base.py
+++ b/src/calibre/ebooks/metadata/book/base.py
@@ -178,6 +178,8 @@ class Metadata(object):
         return key in object.__getattribute__(self, '_data')
 
     def deepcopy(self):
+        ''' Do not use this method unless you know what you are doing. If you want to create a simple clone of
+        this object, use :meth:`deepcopy_metadata` instead. '''
         m = Metadata(None)
         m.__dict__ = copy.deepcopy(self.__dict__)
         object.__setattr__(m, '_data', copy.deepcopy(object.__getattribute__(self, '_data')))

From ecb520cb6ed8238c1edd99325e24c47596dfec64 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 4 May 2013 10:01:40 +0530
Subject: [PATCH 06/11] Fix regression that caused searching for user
 categories to break. Fixes #1176187 (User Categories:true shows no results)

---
 src/calibre/utils/search_query_parser.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/calibre/utils/search_query_parser.py b/src/calibre/utils/search_query_parser.py
index 47bc902c1c..589aa313f2 100644
--- a/src/calibre/utils/search_query_parser.py
+++ b/src/calibre/utils/search_query_parser.py
@@ -133,6 +133,7 @@ class Parser(object):
         # Had to translate named constants to numeric values
         lex_scanner = re.Scanner([
                 (r'[()]',             lambda x,t: (1, t)),
+                (r'@.+?:[^")\s]+',    lambda x,t: (2, unicode(t))),
                 (r'[^"()\s]+',        lambda x,t: (2, unicode(t))),
                 (r'".*?((?<!\\)")',   lambda x,t: (3, t[1:-1])),
                 (r'\s+',              None)

From df6f0f8dc73f50fafb3e5d44dfdce2e23d0cc10e Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 4 May 2013 11:06:59 +0530
Subject: [PATCH 07/11] Docx metadata: Read the language of the file, if
 present

---
 src/calibre/ebooks/metadata/docx.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/calibre/ebooks/metadata/docx.py b/src/calibre/ebooks/metadata/docx.py
index 1505d397f3..cb265424cc 100644
--- a/src/calibre/ebooks/metadata/docx.py
+++ b/src/calibre/ebooks/metadata/docx.py
@@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en'
 from lxml import etree
 
 from calibre.ebooks.metadata.book.base import Metadata
+from calibre.utils.localization import canonicalize_lang
 from calibre.utils.zipfile import ZipFile
 from calibre.utils.magick.draw import identify_data
 from calibre.ebooks.oeb.base import DC11_NS
@@ -52,6 +53,15 @@ def _read_doc_props(raw, mi):
         raw = etree.tostring(desc[0], method='text', encoding=unicode)
         mi.comments = raw
 
+    langs = []
+    for lang in XPath('//dc:language')(root):
+        if lang.text and lang.text.strip():
+            l = canonicalize_lang(lang.text)
+            if l:
+                langs.append(l)
+    if langs:
+        mi.languages = langs
+
 def _read_app_props(raw, mi):
     root = etree.fromstring(raw, parser=RECOVER_PARSER)
     company = root.xpath('//*[local-name()="Company"]')

From 22f95c8678b130b54a9898fad009f0a103012afc Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 4 May 2013 12:32:06 +0530
Subject: [PATCH 08/11] Refactor DOCX metadata reading to use the container
 class

---
 src/calibre/ebooks/docx/container.py | 123 ++++++++++++++++++++++++---
 src/calibre/ebooks/docx/names.py     |   7 ++
 src/calibre/ebooks/docx/to_html.py   |  41 +++++++++
 src/calibre/ebooks/metadata/docx.py  |  73 ++--------------
 4 files changed, 168 insertions(+), 76 deletions(-)
 create mode 100644 src/calibre/ebooks/docx/to_html.py

diff --git a/src/calibre/ebooks/docx/container.py b/src/calibre/ebooks/docx/container.py
index efbe7b8fcb..cae22e086c 100644
--- a/src/calibre/ebooks/docx/container.py
+++ b/src/calibre/ebooks/docx/container.py
@@ -6,30 +6,90 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 
-import os, sys
+import os, sys, shutil
 
 from lxml import etree
 
 from calibre import walk, guess_type
+from calibre.ebooks.metadata import string_to_authors
+from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.docx import InvalidDOCX
-from calibre.ebooks.docx.names import DOCUMENT
+from calibre.ebooks.docx.names import DOCUMENT, DOCPROPS, XPath, APPPROPS
 from calibre.ptempfile import PersistentTemporaryDirectory
+from calibre.utils.localization import canonicalize_lang
 from calibre.utils.logging import default_log
 from calibre.utils.zipfile import ZipFile
+from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
+
+def fromstring(raw, parser=RECOVER_PARSER):
+    return etree.fromstring(raw, parser=parser)
+
+# Read metadata {{{
+def read_doc_props(raw, mi):  # copy title/tags/authors/comments/languages found in raw XML onto mi
+    root = fromstring(raw)
+    titles = XPath('//dc:title')(root)
+    if titles:
+        title = titles[0].text
+        if title and title.strip():
+            mi.title = title.strip()
+    tags = []
+    for subject in XPath('//dc:subject')(root):
+        if subject.text and subject.text.strip():
+            tags.append(subject.text.strip().replace(',', '_'))
+    for keywords in XPath('//cp:keywords')(root):  # keywords are split on whitespace, then on commas
+        if keywords.text and keywords.text.strip():
+            for x in keywords.text.split():
+                tags.extend(y.strip() for y in x.split(',') if y.strip())  # skip empty pieces ("a, b" -> "a,")
+    if tags:
+        mi.tags = tags
+    authors = XPath('//dc:creator')(root)
+    aut = []
+    for author in authors:
+        if author.text and author.text.strip():
+            aut.extend(string_to_authors(author.text))
+    if aut:
+        mi.authors = aut
+
+    desc = XPath('//dc:description')(root)
+    if desc:
+        raw = etree.tostring(desc[0], method='text', encoding=unicode)
+        mi.comments = raw
+
+    langs = []
+    for lang in XPath('//dc:language')(root):
+        if lang.text and lang.text.strip():
+            l = canonicalize_lang(lang.text)
+            if l:  # only languages canonicalize_lang() returns a truthy value for are kept
+                langs.append(l)
+    if langs:
+        mi.languages = langs
+
+def read_app_props(raw, mi):
+    root = fromstring(raw)
+    company = root.xpath('//*[local-name()="Company"]')
+    if company and company[0].text and company[0].text.strip():
+        mi.publisher = company[0].text.strip()
+# }}}
 
 class DOCX(object):
 
-    def __init__(self, path_or_stream, log=None):
+    def __init__(self, path_or_stream, log=None, extract=True):
         stream = path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb')
         self.name = getattr(stream, 'name', None) or '<stream>'
         self.log = log or default_log
-        self.tdir = PersistentTemporaryDirectory('docx_container')
-
-        self.extract(stream)
+        if extract:
+            self.extract(stream)
+        else:
+            self.init_zipfile(stream)
         self.read_content_types()
         self.read_package_relationships()
 
+    def init_zipfile(self, stream):
+        self.zipf = ZipFile(stream)
+        self.names = frozenset(self.zipf.namelist())
+
     def extract(self, stream):
+        self.tdir = PersistentTemporaryDirectory('docx_container')
         try:
             zf = ZipFile(stream)
             zf.extractall(self.tdir)
@@ -46,6 +106,8 @@ class DOCX(object):
             self.names[name] = f
 
     def read(self, name):
+        if hasattr(self, 'zipf'):
+            return self.zipf.open(name).read()
         path = self.names[name]
         with open(path, 'rb') as f:
             return f.read()
@@ -55,7 +117,7 @@ class DOCX(object):
             raw = self.read('[Content_Types].xml')
         except KeyError:
             raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name)
-        root = etree.fromstring(raw)
+        root = fromstring(raw)
         self.content_types = {}
         self.default_content_types = {}
         for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'):
@@ -77,7 +139,7 @@ class DOCX(object):
             raw = self.read('_rels/.rels')
         except KeyError:
             raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name)
-        root = etree.fromstring(raw)
+        root = fromstring(raw)
         self.relationships = {}
         self.relationships_rmap = {}
         for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
@@ -94,7 +156,48 @@ class DOCX(object):
             if not names:
                 raise InvalidDOCX('The file %s docx file has no main document' % self.name)
             name = names[0]
-        return etree.fromstring(self.read(name))
+        return fromstring(self.read(name))
+
+    @property
+    def metadata(self):
+        mi = Metadata(_('Unknown'))
+        name = self.relationships.get(DOCPROPS, None)
+        if name is None:
+            names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml')
+            if names:
+                name = names[0]
+        if name:
+            try:
+                raw = self.read(name)
+            except KeyError:
+                pass
+            else:
+                read_doc_props(raw, mi)
+
+        name = self.relationships.get(APPPROPS, None)
+        if name is None:
+            names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml')
+            if names:
+                name = names[0]
+        if name:
+            try:
+                raw = self.read(name)
+            except KeyError:
+                pass
+            else:
+                read_app_props(raw, mi)
+
+        return mi
+
+    def close(self):
+        if hasattr(self, 'zipf'):
+            self.zipf.close()
+        else:
+            try:
+                shutil.rmtree(self.tdir)
+            except EnvironmentError:
+                pass
 
 if __name__ == '__main__':
-    d = DOCX(sys.argv[-1])
+    d = DOCX(sys.argv[-1], extract=False)
+    print (d.metadata)
diff --git a/src/calibre/ebooks/docx/names.py b/src/calibre/ebooks/docx/names.py
index 0a31d08ab7..9080377b36 100644
--- a/src/calibre/ebooks/docx/names.py
+++ b/src/calibre/ebooks/docx/names.py
@@ -6,7 +6,11 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 
+from lxml.etree import XPath as X
+
 DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument'
+DOCPROPS = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties'
+APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties'
 
 namespaces = {
     'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
@@ -38,3 +42,6 @@ namespaces = {
     'dcterms': 'http://purl.org/dc/terms/'
 }
 
+def XPath(expr):
+    return X(expr, namespaces=namespaces)
+
diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py
new file mode 100644
index 0000000000..b2a5de4691
--- /dev/null
+++ b/src/calibre/ebooks/docx/to_html.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import sys, os
+
+from lxml import html
+from lxml.html.builder import (HTML, HEAD, TITLE, BODY, LINK, META)
+
+from calibre.ebooks.docx.container import DOCX
+
+class Convert(object):  # writes a skeleton index.html for a DOCX file into dest_dir
+
+    def __init__(self, path_or_stream, dest_dir=None, log=None):
+        self.container = DOCX(path_or_stream, log=log)  # container.py defines DOCX; there is no Container class
+        self.log = self.container.log
+        self.dest_dir = dest_dir or os.getcwdu()  # default to the current working directory
+        self.body = BODY()
+        self.html = HTML(
+            HEAD(
+                META(charset='utf-8'),
+                TITLE('TODO: read from metadata'),
+                LINK(rel='stylesheet', type='text/css', href='docx.css'),
+            ),
+            self.body
+        )
+
+    def __call__(self):  # calling the instance currently only serializes the HTML tree
+        self.write()
+
+    def write(self):
+        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
+        with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
+            f.write(raw)
+
+if __name__ == '__main__':
+    Convert(sys.argv[-1])()
diff --git a/src/calibre/ebooks/metadata/docx.py b/src/calibre/ebooks/metadata/docx.py
index cb265424cc..31b0c48974 100644
--- a/src/calibre/ebooks/metadata/docx.py
+++ b/src/calibre/ebooks/metadata/docx.py
@@ -7,80 +7,21 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-from lxml import etree
+from calibre.ebooks.docx.container import DOCX
 
-from calibre.ebooks.metadata.book.base import Metadata
-from calibre.utils.localization import canonicalize_lang
 from calibre.utils.zipfile import ZipFile
 from calibre.utils.magick.draw import identify_data
-from calibre.ebooks.oeb.base import DC11_NS
-from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
-
-NSMAP = {'dc':DC11_NS,
-'cp':'http://schemas.openxmlformats.org/package/2006/metadata/core-properties'}
-
-def XPath(expr):
-    return etree.XPath(expr, namespaces=NSMAP)
-
-def _read_doc_props(raw, mi):
-    from calibre.ebooks.metadata import string_to_authors
-    root = etree.fromstring(raw, parser=RECOVER_PARSER)
-    titles = XPath('//dc:title')(root)
-    if titles:
-        title = titles[0].text
-        if title and title.strip():
-            mi.title = title.strip()
-    tags = []
-    for subject in XPath('//dc:subject')(root):
-        if subject.text and subject.text.strip():
-            tags.append(subject.text.strip().replace(',', '_'))
-    for keywords in XPath('//cp:keywords')(root):
-        if keywords.text and keywords.text.strip():
-            for x in keywords.text.split():
-                tags.extend(y.strip() for y in x.split(','))
-    if tags:
-        mi.tags = tags
-    authors = XPath('//dc:creator')(root)
-    aut = []
-    for author in authors:
-        if author.text and author.text.strip():
-            aut.extend(string_to_authors(author.text))
-    if aut:
-        mi.authors = aut
-
-    desc = XPath('//dc:description')(root)
-    if desc:
-        raw = etree.tostring(desc[0], method='text', encoding=unicode)
-        mi.comments = raw
-
-    langs = []
-    for lang in XPath('//dc:language')(root):
-        if lang.text and lang.text.strip():
-            l = canonicalize_lang(lang.text)
-            if l:
-                langs.append(l)
-    if langs:
-        mi.languages = langs
-
-def _read_app_props(raw, mi):
-    root = etree.fromstring(raw, parser=RECOVER_PARSER)
-    company = root.xpath('//*[local-name()="Company"]')
-    if company and company[0].text and company[0].text.strip():
-        mi.publisher = company[0].text.strip()
 
 def get_metadata(stream):
+    c = DOCX(stream, extract=False)
+    mi = c.metadata
+    c.close()
+    stream.seek(0)
+    cdata = None
     with ZipFile(stream, 'r') as zf:
-
-        mi = Metadata(_('Unknown'))
-        cdata = None
-
         for zi in zf.infolist():
             ext = zi.filename.rpartition('.')[-1].lower()
-            if zi.filename.lower() == 'docprops/core.xml':
-                _read_doc_props(zf.read(zi), mi)
-            elif zi.filename.lower() == 'docprops/app.xml':
-                _read_app_props(zf.read(zi), mi)
-            elif cdata is None and ext in {'jpeg', 'jpg', 'png', 'gif'}:
+            if cdata is None and ext in {'jpeg', 'jpg', 'png', 'gif'}:
                 raw = zf.read(zi)
                 try:
                     width, height, fmt = identify_data(raw)

From 4037971bded2766c26b31d23a12b3eaa36040c76 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 4 May 2013 14:16:17 +0530
Subject: [PATCH 09/11] Update The Sun

---
 recipes/the_sun.recipe | 58 ++++++++++++++----------------------------
 1 file changed, 19 insertions(+), 39 deletions(-)

diff --git a/recipes/the_sun.recipe b/recipes/the_sun.recipe
index 10027d4583..962aa67b91 100644
--- a/recipes/the_sun.recipe
+++ b/recipes/the_sun.recipe
@@ -1,4 +1,4 @@
-import re, random
+import random
 
 from calibre import browser
 from calibre.web.feeds.recipes import BasicNewsRecipe
@@ -8,7 +8,7 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     title          = u'The Sun UK'
     description = 'Articles from The Sun tabloid UK'
     __author__ = 'Dave Asbury'
-    # last updated 19/10/12 better cover fetch
+    # last updated 5/5/13 better cover fetch
     language = 'en_GB'
     oldest_article = 1
     max_articles_per_feed = 15
@@ -29,16 +29,12 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
         dict(name='div',attrs={'class' : 'intro'}),
                                 dict(name='h3'),
         dict(name='div',attrs={'id' : 'articlebody'}),
-           #dict(attrs={'class' : ['right_col_branding','related-stories','mystery-meat-link','ltbx-container','ltbx-var ltbx-hbxpn','ltbx-var ltbx-nav-loop','ltbx-var ltbx-url']}),
-           #                dict(name='div',attrs={'class' : 'cf'}),
-          # dict(attrs={'title' : 'download flash'}),
-          #                 dict(attrs={'style' : 'padding: 5px'})
 
-           ]
+    ]
     remove_tags_after = [dict(id='bodyText')]
     remove_tags=[
-                  dict(name='li'),
-                              dict(attrs={'class' : 'grid-4 right-hand-column'}),
+                    dict(name='li'),
+                    dict(attrs={'class' : 'grid-4 right-hand-column'}),
         ]
 
     feeds          = [
@@ -47,40 +43,24 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     (u'Showbiz', u'http://www.thesun.co.uk/sol/homepage/showbiz/rss'),
     (u'Woman', u'http://www.thesun.co.uk/sol/homepage/woman/rss'),
     ]
-# starsons code
-    def parse_feeds (self):
-      feeds = BasicNewsRecipe.parse_feeds(self)
-      for feed in feeds:
-        for article in feed.articles[:]:
-          print 'article.title is: ', article.title
-          if 'Try out The Sun' in article.title.upper() or 'Try-out-The-Suns' in article.url:
-            feed.articles.remove(article)
-          if 'Web porn harms kids' in article.title.upper() or 'Sun-says-Web-porn' in article.url:
-            feed.articles.remove(article)
-      return feeds
+    # starsons code: drop promotional articles; titles are compared upper-cased, so literals must be upper-case
+    def parse_feeds(self):
+        feeds = BasicNewsRecipe.parse_feeds(self)
+        for feed in feeds:
+            for article in feed.articles[:]:  # iterate over a copy because matches are removed from the list
+                if 'TRY OUT THE SUN' in article.title.upper() or 'Try-out-The-Suns' in article.url:
+                    feed.articles.remove(article)
+                elif 'WEB PORN HARMS KIDS' in article.title.upper() or 'Sun-says-Web-porn' in article.url:
+                    feed.articles.remove(article)  # elif: an article is removed at most once (no ValueError)
+        return feeds
 
     def get_cover_url(self):
-        soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
-        # look for the block containing the sun button and url
-        cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'})
-
-        #cov = soup.find(attrs={'id' : 'large'})
-        cov2 = str(cov)
-
-        cov2='http://www.politicshome.com'+cov2[9:-133]
-        #cov2 now contains url of the page containing pic
-        #cov2 now contains url of the page containing pic
-        soup = self.index_to_soup(cov2)
-        cov = soup.find(attrs={'id' : 'large'})
-        cov=str(cov)
-        cov2 =  re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
-        cov2 = str(cov2)
-        cov2=cov2[2:len(cov2)-2]
         br = browser()
         br.set_handle_redirect(False)
+        cover_url = 'http://www.thepaperboy.com/frontpages/current/The_Sun_newspaper_front_page.jpg'  # default cover; replaced only if the probe below raises
+        # Probe the cover URL; any exception falls through to a randomly chosen error-page image.
         try:
-            br.open_novisit(cov2)
-            cover_url = cov2
+            br.open_novisit('http://www.thepaperboy.com/frontpages/current/The_Sun_newspaper_front_page.jpg')  # same URL as cover_url above; on success cover_url is kept
         except:
             cover_url = random.choice([
                 'http://img.thesun.co.uk/multimedia/archive/00905/errorpage6_677961a_905507a.jpg'
@@ -88,6 +68,6 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
                 ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage5_677960a_905512a.jpg'
                 ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage2_677957a_905502a.jpg'
                 ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage3_677958a_905503a.jpg'
-                ])
+            ])
 
         return cover_url

From d6a8e92dcdcd7c9405ccb382984190b0c35c5d0f Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 4 May 2013 14:24:10 +0530
Subject: [PATCH 10/11] When changing to a virtual library, refresh the Book
 Details panel. Fixes #1176296 (Virtual Library - Wrong Book Highlighted When
 Switching Libraries)

---
 src/calibre/gui2/search_restriction_mixin.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/calibre/gui2/search_restriction_mixin.py b/src/calibre/gui2/search_restriction_mixin.py
index f3055341bb..1a9ea621a9 100644
--- a/src/calibre/gui2/search_restriction_mixin.py
+++ b/src/calibre/gui2/search_restriction_mixin.py
@@ -561,6 +561,10 @@ class SearchRestrictionMixin(object):
         self.set_number_of_books_shown()
         self.current_view().setFocus(Qt.OtherFocusReason)
         self.set_window_title()
+        v = self.current_view()
+        if not v.currentIndex().isValid():
+            v.set_current_row()
+        v.refresh_book_details()
 
     def set_number_of_books_shown(self):
         db = self.library_view.model().db

From e7268bc39fc3e3576e3a5c23a3dda32583aadd5d Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 4 May 2013 14:30:43 +0530
Subject: [PATCH 11/11] Add keyboard shortcut to clear additional restriction

---
 manual/gui.rst                               | 2 ++
 src/calibre/gui2/search_restriction_mixin.py | 3 +++
 src/calibre/gui2/ui.py                       | 7 +++++++
 3 files changed, 12 insertions(+)

diff --git a/manual/gui.rst b/manual/gui.rst
index 6d75f65542..f973008ffd 100755
--- a/manual/gui.rst
+++ b/manual/gui.rst
@@ -586,6 +586,8 @@ Calibre has several keyboard shortcuts to save you time and mouse movement. Thes
       - Focus the book list
     * - :kbd:`Ctrl+Esc`
       - Clear the virtual library
+    * - :kbd:`Alt+Esc`
+      - Clear the additional restriction
     * - :kbd:`N or F3`
       - Find the next book that matches the current search (only works if the highlight checkbox next to the search bar is checked)
     * - :kbd:`Shift+N or Shift+F3`
diff --git a/src/calibre/gui2/search_restriction_mixin.py b/src/calibre/gui2/search_restriction_mixin.py
index 1a9ea621a9..c6965aaa6c 100644
--- a/src/calibre/gui2/search_restriction_mixin.py
+++ b/src/calibre/gui2/search_restriction_mixin.py
@@ -549,6 +549,9 @@ class SearchRestrictionMixin(object):
                 restriction = ''
             self._apply_search_restriction(restriction, r)
 
+    def clear_additional_restriction(self):  # applies an empty restriction with an empty name
+        self._apply_search_restriction(restriction='', name='')
+
     def _apply_search_restriction(self, restriction, name):
         self.saved_search.clear()
         # The order below is important. Set the restriction, force a '' search
diff --git a/src/calibre/gui2/ui.py b/src/calibre/gui2/ui.py
index 8a5f1ffbb5..aafea4ef2b 100644
--- a/src/calibre/gui2/ui.py
+++ b/src/calibre/gui2/ui.py
@@ -279,6 +279,13 @@ class Main(MainWindow, MainWindowMixin, DeviceMixin, EmailMixin,  # {{{
                 action=self.ctrl_esc_action)
         self.ctrl_esc_action.triggered.connect(self.ctrl_esc)
 
+        self.alt_esc_action = QAction(self)
+        self.addAction(self.alt_esc_action)
+        self.keyboard.register_shortcut('clear additional restriction',
+                _('Clear the additional restriction'), default_keys=('Alt+Esc',),
+                action=self.alt_esc_action)
+        self.alt_esc_action.triggered.connect(self.clear_additional_restriction)
+
         ####################### Start spare job server ########################
         QTimer.singleShot(1000, self.add_spare_server)