Merge from trunk

2025-12-17 18:45:04 -05:00 · 2013-05-11 12:00:46 +02:00 · 2013-05-11 12:00:46 +02:00 · cb93fd7329
commit cb93fd7329
parent 0ae22558a2 90374d24c4
7 changed files with 311 additions and 12 deletions
--- a/recipes/nrc_next.recipe
+++ b/recipes/nrc_next.recipe
@ -0,0 +1,75 @@
+#!/usr/bin/env  python2
+# -*- coding: utf-8 -*-
+# Based on veezh's original recipe, Kovid Goyal's New York Times recipe and Snaabs nrc Handelsblad recipe
+
+__license__   = 'GPL v3'
+__copyright__ = '2013, Niels Giesen'
+
+'''
+www.nrc.nl
+'''
+import os, zipfile
+import time
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ptempfile import PersistentTemporaryFile
+
+
+class NRCNext(BasicNewsRecipe):
+
+    title = u'nrc•next'
+    description = u'De ePaper-versie van nrc•next'
+    language = 'nl'
+    lang = 'nl-NL'
+    needs_subscription = True
+
+    __author__ = 'Niels Giesen'
+
+    conversion_options = {
+        'no_default_epub_cover' : True
+    }
+
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser(self)
+        if self.username is not None and self.password is not None:
+            br.open('http://login.nrc.nl/login')
+            br.select_form(nr=0)
+            br['username'] = self.username
+            br['password'] = self.password
+            br.submit()
+        return br
+
+    def build_index(self):
+
+        today = time.strftime("%Y%m%d")
+
+        domain = "http://digitaleeditie.nrc.nl"
+
+        url = domain + "/digitaleeditie/helekrant/epub/nn_" + today + ".epub"
+        #print url
+
+        try:
+            br = self.get_browser()
+            f = br.open(url)
+        except:
+            self.report_progress(0,_('Kan niet inloggen om editie te downloaden'))
+            raise ValueError('Krant van vandaag nog niet beschikbaar')
+
+        tmp = PersistentTemporaryFile(suffix='.epub')
+        self.report_progress(0,_('downloading epub'))
+        tmp.write(f.read())
+        f.close()
+        br.close()
+        if zipfile.is_zipfile(tmp):
+            try:
+                zfile = zipfile.ZipFile(tmp.name, 'r')
+                zfile.extractall(self.output_dir)
+                self.report_progress(0,_('extracting epub'))
+            except zipfile.BadZipfile:
+                self.report_progress(0,_('BadZip error, continuing'))
+
+        tmp.close()
+        index = os.path.join(self.output_dir, 'metadata.opf')
+
+        self.report_progress(1,_('epub downloaded and extracted'))
+
+        return index
--- a/resources/default_tweaks.py
+++ b/resources/default_tweaks.py
@ -32,7 +32,7 @@ defaults.
 # Set the use_series_auto_increment_tweak_when_importing tweak to True to
 # use the above values when importing/adding books. If this tweak is set to
 # False (the default) then the series number will be set to 1 if it is not
-# explicitly set to during the import. If set to True, then the
+# explicitly set during the import. If set to True, then the
 # series index will be set according to the series_index_auto_increment setting.
 # Note that the use_series_auto_increment_tweak_when_importing tweak is used
 # only when a value is not provided during import. If the importing regular
--- a/src/calibre/ebooks/docx/dump.py
+++ b/src/calibre/ebooks/docx/dump.py
@ -0,0 +1,37 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import sys, os, shutil
+
+from lxml import etree
+
+from calibre import walk
+from calibre.utils.zipfile import ZipFile
+
+def dump(path):
+    dest = os.path.splitext(os.path.basename(path))[0]
+    dest += '_extracted'
+    if os.path.exists(dest):
+        shutil.rmtree(dest)
+    with ZipFile(path) as zf:
+        zf.extractall(dest)
+
+    for f in walk(dest):
+        if f.endswith('.xml'):
+            with open(f, 'r+b') as stream:
+                raw = stream.read()
+                root = etree.fromstring(raw)
+                stream.seek(0)
+                stream.truncate()
+                stream.write(etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True))
+
+    print (path, 'dumped to', dest)
+
+if __name__ == '__main__':
+    dump(sys.argv[-1])
+
--- a/src/calibre/ebooks/docx/names.py
+++ b/src/calibre/ebooks/docx/names.py
@ -12,6 +12,7 @@ DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/
 DOCPROPS  = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties'
 APPPROPS  = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties'
 STYLES    = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles'
+NUMBERING = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering'

 namespaces = {
    'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
--- a/src/calibre/ebooks/docx/numbering.py
+++ b/src/calibre/ebooks/docx/numbering.py
@ -0,0 +1,156 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from calibre.ebooks.docx.block_styles import ParagraphStyle
+from calibre.ebooks.docx.char_styles import RunStyle
+from calibre.ebooks.docx.names import XPath, get
+
+STYLE_MAP = {
+    'aiueo': 'hiragana',
+    'aiueoFullWidth': 'hiragana',
+    'hebrew1': 'hebrew',
+    'iroha': 'katakana-iroha',
+    'irohaFullWidth': 'katakana-iroha',
+    'lowerLetter': 'lower-alpha',
+    'lowerRoman': 'lower-roman',
+    'none': 'none',
+    'upperLetter': 'upper-alpha',
+    'upperRoman': 'upper-roman',
+    'chineseCounting': 'cjk-ideographic',
+    'decimalZero': 'decimal-leading-zero',
+}
+
+class Level(object):
+
+    def __init__(self, lvl=None):
+        self.restart = None
+        self.start = 0
+        self.fmt = 'decimal'
+        self.para_link = None
+        self.paragraph_style = self.character_style = None
+
+        if lvl is not None:
+            self.read_from_xml(lvl)
+
+    def read_from_xml(self, lvl, override=False):
+        for lr in XPath('./w:lvlRestart[@w:val]')(lvl):
+            try:
+                self.restart = int(get(lr, 'w:val'))
+            except (TypeError, ValueError):
+                pass
+
+        for lr in XPath('./w:start[@w:val]')(lvl):
+            try:
+                self.start = int(get(lr, 'w:val'))
+            except (TypeError, ValueError):
+                pass
+
+        lt = None
+        for lr in XPath('./w:lvlText[@w:val]')(lvl):
+            lt = get(lr, 'w:val')
+
+        for lr in XPath('./w:numFmt[@w:val]')(lvl):
+            val = get(lr, 'w:val')
+            if val == 'bullet':
+                self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc')
+            else:
+                self.fmt = STYLE_MAP.get(val, 'decimal')
+
+        for lr in XPath('./w:pStyle[@w:val]')(lvl):
+            self.para_link = get(lr, 'w:val')
+
+        for pPr in XPath('./w:pPr')(lvl):
+            ps = ParagraphStyle(pPr)
+            if self.paragraph_style is None:
+                self.paragraph_style = ps
+            else:
+                self.paragraph_style.update(ps)
+
+        for rPr in XPath('./w:rPr')(lvl):
+            ps = RunStyle(rPr)
+            if self.character_style is None:
+                self.character_style = ps
+            else:
+                self.character_style.update(ps)
+
+    def copy(self):
+        ans = Level()
+        for x in ('restart', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style'):
+            setattr(ans, x, getattr(self, x))
+        return ans
+
+class NumberingDefinition(object):
+
+    def __init__(self, parent=None):
+        self.levels = {}
+        if parent is not None:
+            for lvl in XPath('./w:lvl')(parent):
+                try:
+                    ilvl = int(get(lvl, 'w:ilvl', 0))
+                except (TypeError, ValueError):
+                    ilvl = 0
+                self.levels[ilvl] = Level(lvl)
+
+    def copy(self):
+        ans = NumberingDefinition()
+        for l, lvl in self.levels.iteritems():
+            ans.levels[l] = lvl.copy()
+        return ans
+
+class Numbering(object):
+
+    def __init__(self):
+        self.definitions = {}
+        self.instances = {}
+
+    def __call__(self, root, styles):
+        ' Read all numbering style definitions '
+        lazy_load = {}
+        for an in XPath('./w:abstractNum[@w:abstractNumId]')(root):
+            an_id = get(an, 'w:abstractNumId')
+            nsl = XPath('./w:numStyleLink[@w:val]')(an)
+            if nsl:
+                lazy_load[an_id] = get(nsl[0], 'w:val')
+            else:
+                nd = NumberingDefinition(an)
+                self.definitions[an_id] = nd
+
+        def create_instance(n, definition):
+            nd = definition.copy()
+            for lo in XPath('./w:lvlOverride')(n):
+                ilvl = get(lo, 'w:ilvl')
+                for lvl in XPath('./w:lvl')(lo)[:1]:
+                    nilvl = get(lvl, 'w:ilvl')
+                    ilvl = nilvl if ilvl is None else ilvl
+                    alvl = nd.levels.get(ilvl, None)
+                    if alvl is None:
+                        alvl = Level()
+                    alvl.read_from_xml(lvl, override=True)
+
+        next_pass = {}
+        for n in XPath('./w:num[@w:numId]')(root):
+            an_id = None
+            num_id = get(n, 'w:numId')
+            for an in XPath('./w:abstractNumId[@w:val]')(n):
+                an_id = get(an, 'w:val')
+            d = self.definitions.get(an_id, None)
+            if d is None:
+                next_pass[num_id] = (an_id, n)
+                continue
+            self.instances[num_id] = create_instance(n, d)
+
+        numbering_links = styles.numbering_style_links
+        for an_id, style_link in lazy_load.iteritems():
+            num_id = numbering_links[style_link]
+            self.definitions[an_id] = self.instances[num_id].copy()
+
+        for num_id, (an_id, n) in next_pass.iteritems():
+            d = self.definitions.get(an_id, None)
+            if d is not None:
+                self.instances[num_id] = create_instance(n, d)
+
--- a/src/calibre/ebooks/docx/styles.py
+++ b/src/calibre/ebooks/docx/styles.py
@ -52,6 +52,11 @@ class Style(object):
                else:
                    self.character_style.update(rs)

+        if self.style_type == 'numbering':
+            self.numbering_style_link = None
+            for x in XPath('./w:pPr/w:numPr/w:numId[@w:val]')(elem):
+                self.numbering_style_link = get(x, 'w:val')
+
    def resolve_based_on(self, parent):
        if parent.paragraph_style is not None:
            if self.paragraph_style is None:
@ -77,6 +82,7 @@ class Styles(object):
        self.classes = {}
        self.counter = Counter()
        self.default_styles = {}
+        self.numbering_style_links = {}

    def __iter__(self):
        for s in self.id_map.itervalues():
@ -98,6 +104,8 @@ class Styles(object):
                self.id_map[s.style_id] = s
            if s.is_default:
                self.default_styles[s.style_type] = s
+            if s.style_type == 'numbering' and s.numbering_style_link:
+                self.numbering_style_links[s.style_id] = s.numbering_style_link

        self.default_paragraph_style = self.default_character_style = None

@ -235,6 +243,9 @@ class Styles(object):
        if obj.tag.endswith('}r'):
            return self.resolve_run(obj)

+    def resolve_numbering(self, numbering):
+        pass  # TODO: Implement this
+
    def register(self, css, prefix):
        h = hash(tuple(css.iteritems()))
        ans, _ = self.classes.get(h, (None, None))
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@ -13,8 +13,9 @@ from lxml.html.builder import (
    HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR)

 from calibre.ebooks.docx.container import DOCX, fromstring
-from calibre.ebooks.docx.names import XPath, is_tag, barename, XML, STYLES
+from calibre.ebooks.docx.names import XPath, is_tag, barename, XML, STYLES, NUMBERING
 from calibre.ebooks.docx.styles import Styles, inherit
+from calibre.ebooks.docx.numbering import Numbering
 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1

 class Text:
@ -89,12 +90,20 @@ class Convert(object):
        self.write()

    def read_styles(self, relationships_by_type):
-        sname = relationships_by_type.get(STYLES, None)
-        if sname is None:
-            name = self.docx.document_name.split('/')
-            name[-1] = 'styles.xml'
-            if self.docx.exists(name):
-                sname = name
+
+        def get_name(rtype, defname):
+            name = relationships_by_type.get(rtype, None)
+            if name is None:
+                cname = self.docx.document_name.split('/')
+                cname[-1] = defname
+                if self.docx.exists(cname):
+                    name = name
+            return name
+
+        nname = get_name(NUMBERING, 'numbering.xml')
+        sname = get_name(STYLES, 'styles.xml')
+        numbering = Numbering()
+
        if sname is not None:
            try:
                raw = self.docx.read(sname)
@ -103,6 +112,16 @@ class Convert(object):
            else:
                self.styles(fromstring(raw))

+        if nname is not None:
+            try:
+                raw = self.docx.read(nname)
+            except KeyError:
+                self.log.warn('Numbering styles %s do not exist' % nname)
+            else:
+                numbering(fromstring(raw), self.styles)
+
+        self.styles.resolve_numbering(numbering)
+
    def write(self):
        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
        with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f: