From 8f08d9446dc5578f558d1e74c94e019c51cb7961 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 4 May 2013 07:46:41 +0530 Subject: [PATCH 01/11] Start work on docx input plugin --- src/calibre/ebooks/docx/__init__.py | 11 +++ src/calibre/ebooks/docx/container.py | 100 +++++++++++++++++++++++++++ src/calibre/ebooks/docx/names.py | 40 +++++++++++ 3 files changed, 151 insertions(+) create mode 100644 src/calibre/ebooks/docx/__init__.py create mode 100644 src/calibre/ebooks/docx/container.py create mode 100644 src/calibre/ebooks/docx/names.py diff --git a/src/calibre/ebooks/docx/__init__.py b/src/calibre/ebooks/docx/__init__.py new file mode 100644 index 0000000000..f8bda2506d --- /dev/null +++ b/src/calibre/ebooks/docx/__init__.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2013, Kovid Goyal ' + +class InvalidDOCX(ValueError): + pass + diff --git a/src/calibre/ebooks/docx/container.py b/src/calibre/ebooks/docx/container.py new file mode 100644 index 0000000000..efbe7b8fcb --- /dev/null +++ b/src/calibre/ebooks/docx/container.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2013, Kovid Goyal ' + +import os, sys + +from lxml import etree + +from calibre import walk, guess_type +from calibre.ebooks.docx import InvalidDOCX +from calibre.ebooks.docx.names import DOCUMENT +from calibre.ptempfile import PersistentTemporaryDirectory +from calibre.utils.logging import default_log +from calibre.utils.zipfile import ZipFile + +class DOCX(object): + + def __init__(self, path_or_stream, log=None): + stream = path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb') + self.name = getattr(stream, 'name', None) or '' + self.log = log or default_log + self.tdir = PersistentTemporaryDirectory('docx_container') + + self.extract(stream) + self.read_content_types() + self.read_package_relationships() + + def extract(self, stream): + try: + zf = ZipFile(stream) + zf.extractall(self.tdir) + except: + self.log.exception('DOCX appears to be invalid ZIP file, trying a' + ' more forgiving ZIP parser') + from calibre.utils.localunzip import extractall + stream.seek(0) + extractall(stream, self.tdir) + + self.names = {} + for f in walk(self.tdir): + name = os.path.relpath(f, self.tdir).replace(os.sep, '/') + self.names[name] = f + + def read(self, name): + path = self.names[name] + with open(path, 'rb') as f: + return f.read() + + def read_content_types(self): + try: + raw = self.read('[Content_Types].xml') + except KeyError: + raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name) + root = etree.fromstring(raw) + self.content_types = {} + self.default_content_types = {} + for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'): + self.default_content_types[item.get('Extension').lower()] = item.get('ContentType') + for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Override" and @PartName and @ContentType]'): + name = item.get('PartName').lstrip('/') + self.content_types[name] = item.get('ContentType') + + def content_type(self, name): + if name in self.content_types: + return self.content_types[name] + ext = name.rpartition('.')[-1].lower() + if ext in self.default_content_types: + return self.default_content_types[ext] + return guess_type(name)[0] + + def read_package_relationships(self): + try: + raw = self.read('_rels/.rels') + except KeyError: + raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name) + root = etree.fromstring(raw) + self.relationships = {} + self.relationships_rmap = {} + for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'): + target = item.get('Target').lstrip('/') + typ = item.get('Type') + self.relationships[typ] = target + self.relationships_rmap[target] = typ + + @property + def document(self): + name = self.relationships.get(DOCUMENT, None) + if name is None: + names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml')) + if not names: + raise InvalidDOCX('The file %s docx file has no main document' % self.name) + name = names[0] + return etree.fromstring(self.read(name)) + +if __name__ == '__main__': + d = DOCX(sys.argv[-1]) diff --git a/src/calibre/ebooks/docx/names.py b/src/calibre/ebooks/docx/names.py new file mode 100644 index 0000000000..0a31d08ab7 --- /dev/null +++ b/src/calibre/ebooks/docx/names.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2013, Kovid Goyal ' + +DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument' + +namespaces = { + 'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main', + 'o': 'urn:schemas-microsoft-com:office:office', + 've': 'http://schemas.openxmlformats.org/markup-compatibility/2006', + # Text Content + 'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main', + 'w10': 'urn:schemas-microsoft-com:office:word', + 'wne': 'http://schemas.microsoft.com/office/word/2006/wordml', + # Drawing + 'a': 'http://schemas.openxmlformats.org/drawingml/2006/main', + 'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math', + 'mv': 'urn:schemas-microsoft-com:mac:vml', + 'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture', + 'v': 'urn:schemas-microsoft-com:vml', + 'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing', + # Properties (core and extended) + 'cp': 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties', + 'dc': 'http://purl.org/dc/elements/1.1/', + 'ep': 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties', + 'xsi': 'http://www.w3.org/2001/XMLSchema-instance', + # Content Types + 'ct': 'http://schemas.openxmlformats.org/package/2006/content-types', + # Package Relationships + 'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships', + 'pr': 'http://schemas.openxmlformats.org/package/2006/relationships', + # Dublin Core document properties + 'dcmitype': 'http://purl.org/dc/dcmitype/', + 'dcterms': 'http://purl.org/dc/terms/' +} + From f7a44c80f8666e1d95661616cec01d1f030c31d8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 4 May 2013 07:47:12 +0530 Subject: [PATCH 02/11] ... --- src/calibre/ebooks/oeb/reader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index 6a3747d2d3..eb7e2eca4c 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -24,6 +24,7 @@ from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \ urlnormalize, BINARY_MIME, \ OEBError, OEBBook, DirContainer from calibre.ebooks.oeb.writer import OEBWriter +from calibre.utils.cleantext import clean_xml_chars from calibre.utils.localization import get_lang from calibre.ptempfile import TemporaryDirectory from calibre.constants import __appname__, __version__ @@ -106,7 +107,7 @@ class OEBReader(object): try: opf = etree.fromstring(data) except etree.XMLSyntaxError: - data = xml_replace_entities(data, encoding=None) + data = xml_replace_entities(clean_xml_chars(data), encoding=None) try: opf = etree.fromstring(data) self.logger.warn('OPF contains invalid HTML named entities') From 654ce41161aa7e38aab28f164ce4f8a943568a76 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 4 May 2013 08:05:24 +0530 Subject: [PATCH 03/11] Fix error when downloading only covers and reviewing downloaded metadata. Fixes #1176253 (Bulk Download Covers Only - 0.9.29) --- src/calibre/gui2/actions/edit_metadata.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/calibre/gui2/actions/edit_metadata.py b/src/calibre/gui2/actions/edit_metadata.py index 19c7ee127e..0fd5d31944 100644 --- a/src/calibre/gui2/actions/edit_metadata.py +++ b/src/calibre/gui2/actions/edit_metadata.py @@ -240,9 +240,10 @@ class EditMetadataAction(InterfaceAction): opf, cov = id_map[book_id] cfile = mi.cover mi.cover, mi.cover_data = None, (None, None) - with open(opf, 'wb') as f: - f.write(metadata_to_opf(mi)) - if cfile: + if opf is not None: + with open(opf, 'wb') as f: + f.write(metadata_to_opf(mi)) + if cfile and cov: shutil.copyfile(cfile, cov) os.remove(cfile) nid_map[book_id] = id_map[book_id] From 7af7030d983deeee9d99a1ae36bed4d1c7b981e7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 4 May 2013 08:53:57 +0530 Subject: [PATCH 04/11] Fix regression that broke deepcopying of Metadata() objects --- src/calibre/ebooks/metadata/book/formatter.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/metadata/book/formatter.py b/src/calibre/ebooks/metadata/book/formatter.py index 4ddd3d68df..7adbe81016 100644 --- a/src/calibre/ebooks/metadata/book/formatter.py +++ b/src/calibre/ebooks/metadata/book/formatter.py @@ -14,16 +14,15 @@ class SafeFormat(TemplateFormatter): def __init__(self): TemplateFormatter.__init__(self) - from calibre.ebooks.metadata.book.base import field_metadata - self.field_metadata = field_metadata def get_value(self, orig_key, args, kwargs): if not orig_key: return '' key = orig_key = orig_key.lower() - if key != 'title_sort' and key not in TOP_LEVEL_IDENTIFIERS and \ - key not in ALL_METADATA_FIELDS: - key = self.field_metadata.search_term_to_field_key(key) + if (key != 'title_sort' and key not in TOP_LEVEL_IDENTIFIERS and + key not in ALL_METADATA_FIELDS): + from calibre.ebooks.metadata.book.base import field_metadata + key = field_metadata.search_term_to_field_key(key) if key is None or (self.book and key not in self.book.all_field_keys()): if hasattr(self.book, orig_key): From c4361f88c486cbac957fe7c1d5f80853f9bd1ce9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 4 May 2013 08:56:09 +0530 Subject: [PATCH 05/11] ... --- src/calibre/ebooks/metadata/book/base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/ebooks/metadata/book/base.py b/src/calibre/ebooks/metadata/book/base.py index 2f11ca9326..4104b18a3f 100644 --- a/src/calibre/ebooks/metadata/book/base.py +++ b/src/calibre/ebooks/metadata/book/base.py @@ -178,6 +178,8 @@ class Metadata(object): return key in object.__getattribute__(self, '_data') def deepcopy(self): + ''' Do not use this method unless you know what you are doing, if you want to create a simple clone of + this object, use :method:`deepcopy_metadata` instead. ''' m = Metadata(None) m.__dict__ = copy.deepcopy(self.__dict__) object.__setattr__(m, '_data', copy.deepcopy(object.__getattribute__(self, '_data'))) From ecb520cb6ed8238c1edd99325e24c47596dfec64 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 4 May 2013 10:01:40 +0530 Subject: [PATCH 06/11] Fix regression that caused searching for user categories to break. Fixes #1176187 (User Categories:true shows no results) --- src/calibre/utils/search_query_parser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/utils/search_query_parser.py b/src/calibre/utils/search_query_parser.py index 47bc902c1c..589aa313f2 100644 --- a/src/calibre/utils/search_query_parser.py +++ b/src/calibre/utils/search_query_parser.py @@ -133,6 +133,7 @@ class Parser(object): # Had to translate named constants to numeric values lex_scanner = re.Scanner([ (r'[()]', lambda x,t: (1, t)), + (r'@.+?:[^")\s]+', lambda x,t: (2, unicode(t))), (r'[^"()\s]+', lambda x,t: (2, unicode(t))), (r'".*?((? Date: Sat, 4 May 2013 11:06:59 +0530 Subject: [PATCH 07/11] Docx metadata: Read the language of the file, if present --- src/calibre/ebooks/metadata/docx.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/calibre/ebooks/metadata/docx.py b/src/calibre/ebooks/metadata/docx.py index 1505d397f3..cb265424cc 100644 --- a/src/calibre/ebooks/metadata/docx.py +++ b/src/calibre/ebooks/metadata/docx.py @@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en' from lxml import etree from calibre.ebooks.metadata.book.base import Metadata +from calibre.utils.localization import canonicalize_lang from calibre.utils.zipfile import ZipFile from calibre.utils.magick.draw import identify_data from calibre.ebooks.oeb.base import DC11_NS @@ -52,6 +53,15 @@ def _read_doc_props(raw, mi): raw = etree.tostring(desc[0], method='text', encoding=unicode) mi.comments = raw + langs = [] + for lang in XPath('//dc:language')(root): + if lang.text and lang.text.strip(): + l = canonicalize_lang(lang.text) + if l: + langs.append(l) + if langs: + mi.languages = langs + def _read_app_props(raw, mi): root = etree.fromstring(raw, parser=RECOVER_PARSER) company = root.xpath('//*[local-name()="Company"]') From 22f95c8678b130b54a9898fad009f0a103012afc Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 4 May 2013 12:32:06 +0530 Subject: [PATCH 08/11] Refactor DOCX metadata reading to use the container class --- src/calibre/ebooks/docx/container.py | 123 ++++++++++++++++++++++++--- src/calibre/ebooks/docx/names.py | 7 ++ src/calibre/ebooks/docx/to_html.py | 41 +++++++++ src/calibre/ebooks/metadata/docx.py | 73 ++-------------- 4 files changed, 168 insertions(+), 76 deletions(-) create mode 100644 src/calibre/ebooks/docx/to_html.py diff --git a/src/calibre/ebooks/docx/container.py b/src/calibre/ebooks/docx/container.py index efbe7b8fcb..cae22e086c 100644 --- a/src/calibre/ebooks/docx/container.py +++ b/src/calibre/ebooks/docx/container.py @@ -6,30 +6,90 @@ from __future__ import (unicode_literals, division, absolute_import, __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' -import os, sys +import os, sys, shutil from lxml import etree from calibre import walk, guess_type +from calibre.ebooks.metadata import string_to_authors +from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.docx import InvalidDOCX -from calibre.ebooks.docx.names import DOCUMENT +from calibre.ebooks.docx.names import DOCUMENT, DOCPROPS, XPath, APPPROPS from calibre.ptempfile import PersistentTemporaryDirectory +from calibre.utils.localization import canonicalize_lang from calibre.utils.logging import default_log from calibre.utils.zipfile import ZipFile +from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER + +def fromstring(raw, parser=RECOVER_PARSER): + return etree.fromstring(raw, parser=parser) + +# Read metadata {{{ +def read_doc_props(raw, mi): + root = fromstring(raw) + titles = XPath('//dc:title')(root) + if titles: + title = titles[0].text + if title and title.strip(): + mi.title = title.strip() + tags = [] + for subject in XPath('//dc:subject')(root): + if subject.text and subject.text.strip(): + tags.append(subject.text.strip().replace(',', '_')) + for keywords in XPath('//cp:keywords')(root): + if keywords.text and keywords.text.strip(): + for x in keywords.text.split(): + tags.extend(y.strip() for y in x.split(',')) + if tags: + mi.tags = tags + authors = XPath('//dc:creator')(root) + aut = [] + for author in authors: + if author.text and author.text.strip(): + aut.extend(string_to_authors(author.text)) + if aut: + mi.authors = aut + + desc = XPath('//dc:description')(root) + if desc: + raw = etree.tostring(desc[0], method='text', encoding=unicode) + mi.comments = raw + + langs = [] + for lang in XPath('//dc:language')(root): + if lang.text and lang.text.strip(): + l = canonicalize_lang(lang.text) + if l: + langs.append(l) + if langs: + mi.languages = langs + +def read_app_props(raw, mi): + root = fromstring(raw) + company = root.xpath('//*[local-name()="Company"]') + if company and company[0].text and company[0].text.strip(): + mi.publisher = company[0].text.strip() +# }}} class DOCX(object): - def __init__(self, path_or_stream, log=None): + def __init__(self, path_or_stream, log=None, extract=True): stream = path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb') self.name = getattr(stream, 'name', None) or '' self.log = log or default_log - self.tdir = PersistentTemporaryDirectory('docx_container') - - self.extract(stream) + if extract: + self.extract(stream) + else: + self.init_zipfile(stream) self.read_content_types() self.read_package_relationships() + def init_zipfile(self, stream): + self.zipf = ZipFile(stream) + self.names = frozenset(self.zipf.namelist()) + def extract(self, stream): + self.tdir = PersistentTemporaryDirectory('docx_container') try: zf = ZipFile(stream) zf.extractall(self.tdir) @@ -46,6 +106,8 @@ class DOCX(object): self.names[name] = f def read(self, name): + if hasattr(self, 'zipf'): + return self.zipf.open(name).read() path = self.names[name] with open(path, 'rb') as f: return f.read() @@ -55,7 +117,7 @@ class DOCX(object): raw = self.read('[Content_Types].xml') except KeyError: raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name) - root = etree.fromstring(raw) + root = fromstring(raw) self.content_types = {} self.default_content_types = {} for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'): @@ -77,7 +139,7 @@ class DOCX(object): raw = self.read('_rels/.rels') except KeyError: raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name) - root = etree.fromstring(raw) + root = fromstring(raw) self.relationships = {} self.relationships_rmap = {} for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'): @@ -94,7 +156,48 @@ class DOCX(object): if not names: raise InvalidDOCX('The file %s docx file has no main document' % self.name) name = names[0] - return etree.fromstring(self.read(name)) + return fromstring(self.read(name)) + + @property + def metadata(self): + mi = Metadata(_('Unknown')) + name = self.relationships.get(DOCPROPS, None) + if name is None: + names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml') + if names: + name = names[0] + if name: + try: + raw = self.read(name) + except KeyError: + pass + else: + read_doc_props(raw, mi) + + name = self.relationships.get(APPPROPS, None) + if name is None: + names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml') + if names: + name = names[0] + if name: + try: + raw = self.read(name) + except KeyError: + pass + else: + read_app_props(raw, mi) + + return mi + + def close(self): + if hasattr(self, 'zipf'): + self.zipf.close() + else: + try: + shutil.rmtree(self.tdir) + except EnvironmentError: + pass if __name__ == '__main__': - d = DOCX(sys.argv[-1]) + d = DOCX(sys.argv[-1], extract=False) + print (d.metadata) diff --git a/src/calibre/ebooks/docx/names.py b/src/calibre/ebooks/docx/names.py index 0a31d08ab7..9080377b36 100644 --- a/src/calibre/ebooks/docx/names.py +++ b/src/calibre/ebooks/docx/names.py @@ -6,7 +6,11 @@ from __future__ import (unicode_literals, division, absolute_import, __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' +from lxml.etree import XPath as X + DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument' +DOCPROPS = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties' +APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties' namespaces = { 'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main', @@ -38,3 +42,6 @@ namespaces = { 'dcterms': 'http://purl.org/dc/terms/' } +def XPath(expr): + return X(expr, namespaces=namespaces) + diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py new file mode 100644 index 0000000000..b2a5de4691 --- /dev/null +++ b/src/calibre/ebooks/docx/to_html.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2013, Kovid Goyal ' + +import sys, os + +from lxml import html +from lxml.html.builder import (HTML, HEAD, TITLE, BODY, LINK, META) + +from calibre.ebooks.docx.container import Container + +class Convert(object): + + def __init__(self, path_or_stream, dest_dir=None, log=None): + self.container = Container(path_or_stream, log=log) + self.log = self.container.log + self.dest_dir = dest_dir or os.getcwdu() + self.body = BODY() + self.html = HTML( + HEAD( + META(charset='utf-8'), + TITLE('TODO: read from metadata'), + LINK(rel='stylesheet', type='text/css', href='docx.css'), + ), + self.body + ) + + def __call__(self): + self.write() + + def write(self): + raw = html.tostring(self.html, encoding='utf-8', doctype='') + with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f: + f.write(raw) + +if __name__ == '__main__': + Convert(sys.argv[-1])() diff --git a/src/calibre/ebooks/metadata/docx.py b/src/calibre/ebooks/metadata/docx.py index cb265424cc..31b0c48974 100644 --- a/src/calibre/ebooks/metadata/docx.py +++ b/src/calibre/ebooks/metadata/docx.py @@ -7,80 +7,21 @@ __license__ = 'GPL v3' __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' -from lxml import etree +from calibre.ebooks.docx.container import DOCX -from calibre.ebooks.metadata.book.base import Metadata -from calibre.utils.localization import canonicalize_lang from calibre.utils.zipfile import ZipFile from calibre.utils.magick.draw import identify_data -from calibre.ebooks.oeb.base import DC11_NS -from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER - -NSMAP = {'dc':DC11_NS, -'cp':'http://schemas.openxmlformats.org/package/2006/metadata/core-properties'} - -def XPath(expr): - return etree.XPath(expr, namespaces=NSMAP) - -def _read_doc_props(raw, mi): - from calibre.ebooks.metadata import string_to_authors - root = etree.fromstring(raw, parser=RECOVER_PARSER) - titles = XPath('//dc:title')(root) - if titles: - title = titles[0].text - if title and title.strip(): - mi.title = title.strip() - tags = [] - for subject in XPath('//dc:subject')(root): - if subject.text and subject.text.strip(): - tags.append(subject.text.strip().replace(',', '_')) - for keywords in XPath('//cp:keywords')(root): - if keywords.text and keywords.text.strip(): - for x in keywords.text.split(): - tags.extend(y.strip() for y in x.split(',')) - if tags: - mi.tags = tags - authors = XPath('//dc:creator')(root) - aut = [] - for author in authors: - if author.text and author.text.strip(): - aut.extend(string_to_authors(author.text)) - if aut: - mi.authors = aut - - desc = XPath('//dc:description')(root) - if desc: - raw = etree.tostring(desc[0], method='text', encoding=unicode) - mi.comments = raw - - langs = [] - for lang in XPath('//dc:language')(root): - if lang.text and lang.text.strip(): - l = canonicalize_lang(lang.text) - if l: - langs.append(l) - if langs: - mi.languages = langs - -def _read_app_props(raw, mi): - root = etree.fromstring(raw, parser=RECOVER_PARSER) - company = root.xpath('//*[local-name()="Company"]') - if company and company[0].text and company[0].text.strip(): - mi.publisher = company[0].text.strip() def get_metadata(stream): + c = DOCX(stream, extract=False) + mi = c.metadata + c.close() + stream.seek(0) + cdata = None with ZipFile(stream, 'r') as zf: - - mi = Metadata(_('Unknown')) - cdata = None - for zi in zf.infolist(): ext = zi.filename.rpartition('.')[-1].lower() - if zi.filename.lower() == 'docprops/core.xml': - _read_doc_props(zf.read(zi), mi) - elif zi.filename.lower() == 'docprops/app.xml': - _read_app_props(zf.read(zi), mi) - elif cdata is None and ext in {'jpeg', 'jpg', 'png', 'gif'}: + if cdata is None and ext in {'jpeg', 'jpg', 'png', 'gif'}: raw = zf.read(zi) try: width, height, fmt = identify_data(raw) From 4037971bded2766c26b31d23a12b3eaa36040c76 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 4 May 2013 14:16:17 +0530 Subject: [PATCH 09/11] Update The Sun --- recipes/the_sun.recipe | 58 ++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 39 deletions(-) diff --git a/recipes/the_sun.recipe b/recipes/the_sun.recipe index 10027d4583..962aa67b91 100644 --- a/recipes/the_sun.recipe +++ b/recipes/the_sun.recipe @@ -1,4 +1,4 @@ -import re, random +import random from calibre import browser from calibre.web.feeds.recipes import BasicNewsRecipe @@ -8,7 +8,7 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): title = u'The Sun UK' description = 'Articles from The Sun tabloid UK' __author__ = 'Dave Asbury' - # last updated 19/10/12 better cover fetch + # last updated 5/5/13 better cover fetch language = 'en_GB' oldest_article = 1 max_articles_per_feed = 15 @@ -29,16 +29,12 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): dict(name='div',attrs={'class' : 'intro'}), dict(name='h3'), dict(name='div',attrs={'id' : 'articlebody'}), - #dict(attrs={'class' : ['right_col_branding','related-stories','mystery-meat-link','ltbx-container','ltbx-var ltbx-hbxpn','ltbx-var ltbx-nav-loop','ltbx-var ltbx-url']}), - # dict(name='div',attrs={'class' : 'cf'}), - # dict(attrs={'title' : 'download flash'}), - # dict(attrs={'style' : 'padding: 5px'}) - ] + ] remove_tags_after = [dict(id='bodyText')] remove_tags=[ - dict(name='li'), - dict(attrs={'class' : 'grid-4 right-hand-column'}), + dict(name='li'), + dict(attrs={'class' : 'grid-4 right-hand-column'}), ] feeds = [ @@ -47,40 +43,24 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): (u'Showbiz', u'http://www.thesun.co.uk/sol/homepage/showbiz/rss'), (u'Woman', u'http://www.thesun.co.uk/sol/homepage/woman/rss'), ] -# starsons code - def parse_feeds (self): - feeds = BasicNewsRecipe.parse_feeds(self) - for feed in feeds: - for article in feed.articles[:]: - print 'article.title is: ', article.title - if 'Try out The Sun' in article.title.upper() or 'Try-out-The-Suns' in article.url: - feed.articles.remove(article) - if 'Web porn harms kids' in article.title.upper() or 'Sun-says-Web-porn' in article.url: - feed.articles.remove(article) - return feeds + # starsons code + def parse_feeds(self): + feeds = BasicNewsRecipe.parse_feeds(self) + for feed in feeds: + for article in feed.articles[:]: + if 'Try out The Sun' in article.title.upper() or 'Try-out-The-Suns' in article.url: + feed.articles.remove(article) + if 'Web porn harms kids' in article.title.upper() or 'Sun-says-Web-porn' in article.url: + feed.articles.remove(article) + return feeds def get_cover_url(self): - soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html') - # look for the block containing the sun button and url - cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'}) - - #cov = soup.find(attrs={'id' : 'large'}) - cov2 = str(cov) - - cov2='http://www.politicshome.com'+cov2[9:-133] - #cov2 now contains url of the page containing pic - #cov2 now contains url of the page containing pic - soup = self.index_to_soup(cov2) - cov = soup.find(attrs={'id' : 'large'}) - cov=str(cov) - cov2 = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov) - cov2 = str(cov2) - cov2=cov2[2:len(cov2)-2] br = browser() br.set_handle_redirect(False) + cover_url = 'http://www.thepaperboy.com/frontpages/current/The_Sun_newspaper_front_page.jpg' + try: - br.open_novisit(cov2) - cover_url = cov2 + br.open_novisit('http://www.thepaperboy.com/frontpages/current/The_Sun_newspaper_front_page.jpg') except: cover_url = random.choice([ 'http://img.thesun.co.uk/multimedia/archive/00905/errorpage6_677961a_905507a.jpg' @@ -88,6 +68,6 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage5_677960a_905512a.jpg' ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage2_677957a_905502a.jpg' ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage3_677958a_905503a.jpg' - ]) + ]) return cover_url From d6a8e92dcdcd7c9405ccb382984190b0c35c5d0f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 4 May 2013 14:24:10 +0530 Subject: [PATCH 10/11] When changing to a virtual library, refresh the Book Details panel. Fixes #1176296 (Virtual Library - Wrong Book Highlighted When Switching Libraries) --- src/calibre/gui2/search_restriction_mixin.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/calibre/gui2/search_restriction_mixin.py b/src/calibre/gui2/search_restriction_mixin.py index f3055341bb..1a9ea621a9 100644 --- a/src/calibre/gui2/search_restriction_mixin.py +++ b/src/calibre/gui2/search_restriction_mixin.py @@ -561,6 +561,10 @@ class SearchRestrictionMixin(object): self.set_number_of_books_shown() self.current_view().setFocus(Qt.OtherFocusReason) self.set_window_title() + v = self.current_view() + if not v.currentIndex().isValid(): + v.set_current_row() + v.refresh_book_details() def set_number_of_books_shown(self): db = self.library_view.model().db From e7268bc39fc3e3576e3a5c23a3dda32583aadd5d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 4 May 2013 14:30:43 +0530 Subject: [PATCH 11/11] Add keyboard shortcut to clear additional restriction --- manual/gui.rst | 2 ++ src/calibre/gui2/search_restriction_mixin.py | 3 +++ src/calibre/gui2/ui.py | 7 +++++++ 3 files changed, 12 insertions(+) diff --git a/manual/gui.rst b/manual/gui.rst index 6d75f65542..f973008ffd 100755 --- a/manual/gui.rst +++ b/manual/gui.rst @@ -586,6 +586,8 @@ Calibre has several keyboard shortcuts to save you time and mouse movement. Thes - Focus the book list * - :kbd:`Ctrl+Esc` - Clear the virtual library + * - :kbd:`Alt+Esc` + - Clear the additional restriction * - :kbd:`N or F3` - Find the next book that matches the current search (only works if the highlight checkbox next to the search bar is checked) * - :kbd:`Shift+N or Shift+F3` diff --git a/src/calibre/gui2/search_restriction_mixin.py b/src/calibre/gui2/search_restriction_mixin.py index 1a9ea621a9..c6965aaa6c 100644 --- a/src/calibre/gui2/search_restriction_mixin.py +++ b/src/calibre/gui2/search_restriction_mixin.py @@ -549,6 +549,9 @@ class SearchRestrictionMixin(object): restriction = '' self._apply_search_restriction(restriction, r) + def clear_additional_restriction(self): + self._apply_search_restriction('', '') + def _apply_search_restriction(self, restriction, name): self.saved_search.clear() # The order below is important. Set the restriction, force a '' search diff --git a/src/calibre/gui2/ui.py b/src/calibre/gui2/ui.py index 8a5f1ffbb5..aafea4ef2b 100644 --- a/src/calibre/gui2/ui.py +++ b/src/calibre/gui2/ui.py @@ -279,6 +279,13 @@ class Main(MainWindow, MainWindowMixin, DeviceMixin, EmailMixin, # {{{ action=self.ctrl_esc_action) self.ctrl_esc_action.triggered.connect(self.ctrl_esc) + self.alt_esc_action = QAction(self) + self.addAction(self.alt_esc_action) + self.keyboard.register_shortcut('clear additional restriction', + _('Clear the additional restriction'), default_keys=('Alt+Esc',), + action=self.alt_esc_action) + self.alt_esc_action.triggered.connect(self.clear_additional_restriction) + ####################### Start spare job server ######################## QTimer.singleShot(1000, self.add_spare_server)