class NRCNext(BasicNewsRecipe):
    """Recipe that fetches the ready-made ePaper (EPUB) edition of nrc.next.

    Instead of building an index from feeds, ``build_index`` downloads the
    EPUB for the current date and unpacks it into ``self.output_dir``.
    """

    title = u'nrc•next'
    description = u'De ePaper-versie van nrc•next'
    language = 'nl'
    lang = 'nl-NL'
    needs_subscription = True

    __author__ = 'Niels Giesen'

    conversion_options = {
        'no_default_epub_cover' : True
    }

    def get_browser(self):
        """Return a browser, submitting the login form when credentials are set.

        The first form on the login page is filled with ``self.username`` and
        ``self.password``; without both values the browser is returned as-is.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://login.nrc.nl/login')
            br.select_form(nr=0)
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def build_index(self):
        """Download today's EPUB, extract it and return the path of its OPF.

        Returns:
            The path ``<output_dir>/metadata.opf``. The path is returned even
            when the download turned out not to be a usable zip archive
            (best-effort behaviour kept from the original implementation).

        Raises:
            ValueError: if logging in or opening the edition URL fails.
        """
        today = time.strftime("%Y%m%d")
        domain = "http://digitaleeditie.nrc.nl"
        url = domain + "/digitaleeditie/helekrant/epub/nn_" + today + ".epub"

        try:
            br = self.get_browser()
            f = br.open(url)
        except Exception:
            # A bare ``except:`` would also swallow KeyboardInterrupt and
            # SystemExit, making the download impossible to abort.
            self.report_progress(0,_('Kan niet inloggen om editie te downloaden'))
            raise ValueError('Krant van vandaag nog niet beschikbaar')

        tmp = PersistentTemporaryFile(suffix='.epub')
        self.report_progress(0,_('downloading epub'))
        try:
            tmp.write(f.read())
        finally:
            f.close()
            br.close()
            # Close (and thereby flush) the temporary file *before* it is
            # re-opened by name below, so the zip reader sees every byte.
            # NOTE(review): assumes PersistentTemporaryFile keeps the file on
            # disk after close(), as its name suggests -- confirm.
            tmp.close()

        # Pass the path rather than the file object: is_zipfile() only accepts
        # file objects from Python 2.7 on, and the object is closed by now.
        if zipfile.is_zipfile(tmp.name):
            try:
                zfile = zipfile.ZipFile(tmp.name, 'r')
                try:
                    zfile.extractall(self.output_dir)
                finally:
                    zfile.close()
                self.report_progress(0,_('extracting epub'))
            except zipfile.BadZipfile:
                self.report_progress(0,_('BadZip error, continuing'))

        index = os.path.join(self.output_dir, 'metadata.opf')

        self.report_progress(1,_('epub downloaded and extracted'))

        return index
def dump(path):
    """Extract the zip container at *path* and pretty-print its XML parts.

    The archive is unpacked into ``<basename-without-extension>_extracted``
    relative to the current working directory; an existing entry of that name
    is removed first. Every extracted file whose name ends in ``.xml`` is
    re-serialised in place as indented UTF-8 with an XML declaration.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    dest = stem + '_extracted'
    if os.path.exists(dest):
        shutil.rmtree(dest)
    with ZipFile(path) as zf:
        zf.extractall(dest)

    for f in walk(dest):
        if not f.endswith('.xml'):
            continue
        with open(f, 'r+b') as stream:
            # Parse first: a malformed file raises before anything is erased.
            root = etree.fromstring(stream.read())
            stream.seek(0)
            stream.truncate()
            pretty = etree.tostring(
                root, pretty_print=True, encoding='utf-8', xml_declaration=True)
            stream.write(pretty)

    print(path, 'dumped to', dest)


if __name__ == '__main__':
    dump(sys.argv[-1])
b/src/calibre/ebooks/docx/names.py index 2b5dcca653..29a7f0eb81 100644 --- a/src/calibre/ebooks/docx/names.py +++ b/src/calibre/ebooks/docx/names.py @@ -8,10 +8,11 @@ __copyright__ = '2013, Kovid Goyal ' from lxml.etree import XPath as X -DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument' -DOCPROPS = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties' -APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties' -STYLES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles' +DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument' +DOCPROPS = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties' +APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties' +STYLES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles' +NUMBERING = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering' namespaces = { 'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main', diff --git a/src/calibre/ebooks/docx/numbering.py b/src/calibre/ebooks/docx/numbering.py new file mode 100644 index 0000000000..fc1e65db6a --- /dev/null +++ b/src/calibre/ebooks/docx/numbering.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2013, Kovid Goyal ' + +from calibre.ebooks.docx.block_styles import ParagraphStyle +from calibre.ebooks.docx.char_styles import RunStyle +from calibre.ebooks.docx.names import XPath, get + +STYLE_MAP = { + 'aiueo': 'hiragana', + 'aiueoFullWidth': 'hiragana', + 'hebrew1': 'hebrew', + 'iroha': 'katakana-iroha', + 'irohaFullWidth': 'katakana-iroha', + 'lowerLetter': 'lower-alpha', + 'lowerRoman': 'lower-roman', + 
class Level(object):
    """Formatting of one list level, read from a ``w:lvl`` element."""

    def __init__(self, lvl=None):
        """Set the defaults and, when *lvl* is given, read it immediately."""
        self.restart = None
        self.start = 0
        self.fmt = 'decimal'
        self.para_link = None
        self.paragraph_style = None
        self.character_style = None

        if lvl is not None:
            self.read_from_xml(lvl)

    def read_from_xml(self, lvl, override=False):
        """Update this level from the children of the element *lvl*.

        Only children that are present change anything, so calling this on an
        already populated level layers the new values on top of the old ones.
        *override* is accepted but not used by this implementation.
        """
        # Integer-valued children; a value that does not parse is ignored.
        int_fields = (
            ('./w:lvlRestart[@w:val]', 'restart'),
            ('./w:start[@w:val]', 'start'),
        )
        for expr, attr in int_fields:
            for node in XPath(expr)(lvl):
                try:
                    setattr(self, attr, int(get(node, 'w:val')))
                except (TypeError, ValueError):
                    pass

        # The level text is only needed to pick the glyph of a bullet.
        level_text = None
        for node in XPath('./w:lvlText[@w:val]')(lvl):
            level_text = get(node, 'w:val')

        bullet_glyphs = {'\uf0a7':'square', 'o':'circle'}
        for node in XPath('./w:numFmt[@w:val]')(lvl):
            kind = get(node, 'w:val')
            if kind == 'bullet':
                self.fmt = bullet_glyphs.get(level_text, 'disc')
            else:
                self.fmt = STYLE_MAP.get(kind, 'decimal')

        for node in XPath('./w:pStyle[@w:val]')(lvl):
            self.para_link = get(node, 'w:val')

        # Paragraph and run properties: the first occurrence becomes the
        # style, later occurrences are merged into it through update().
        style_fields = (
            ('./w:pPr', ParagraphStyle, 'paragraph_style'),
            ('./w:rPr', RunStyle, 'character_style'),
        )
        for expr, factory, attr in style_fields:
            for node in XPath(expr)(lvl):
                parsed = factory(node)
                current = getattr(self, attr)
                if current is None:
                    setattr(self, attr, parsed)
                else:
                    current.update(parsed)

    def copy(self):
        """Return a new Level with the same attribute values.

        The copy is shallow: the style objects are shared with the original.
        """
        clone = Level()
        clone.restart = self.restart
        clone.start = self.start
        clone.fmt = self.fmt
        clone.para_link = self.para_link
        clone.paragraph_style = self.paragraph_style
        clone.character_style = self.character_style
        return clone


class NumberingDefinition(object):
    """The levels of one numbering definition, keyed by integer level index."""

    def __init__(self, parent=None):
        """Read every ``w:lvl`` child of *parent*; no parent means no levels."""
        self.levels = {}
        if parent is None:
            return
        for node in XPath('./w:lvl')(parent):
            try:
                index = int(get(node, 'w:ilvl', 0))
            except (TypeError, ValueError):
                # A missing or malformed index falls back to level 0.
                index = 0
            self.levels[index] = Level(node)

    def copy(self):
        """Return a new definition whose levels are copies of this one's."""
        clone = NumberingDefinition()
        for index in self.levels:
            clone.levels[index] = self.levels[index].copy()
        return clone
class Numbering(object):
    """Numbering (list) definitions and instances of a document.

    ``definitions`` maps an abstract numbering id (``w:abstractNumId``) to a
    NumberingDefinition. ``instances`` maps a ``w:numId`` to a copy of its
    definition with the ``w:lvlOverride`` children of the ``w:num`` element
    applied.
    """

    def __init__(self):
        self.definitions = {}
        self.instances = {}

    @staticmethod
    def _level_index(value):
        """Return *value* as an int, or None when it is missing or malformed."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    def __call__(self, root, styles):
        """Read all numbering style definitions below *root*.

        Args:
            root: element whose ``w:abstractNum`` and ``w:num`` children are
                read.
            styles: object providing ``numbering_style_links``, a mapping from
                the value of a ``w:numStyleLink`` to a numId.
        """
        # Abstract definitions that only point at a numbering style cannot be
        # built until the instance behind that style is known.
        lazy_load = {}
        for an in XPath('./w:abstractNum[@w:abstractNumId]')(root):
            an_id = get(an, 'w:abstractNumId')
            nsl = XPath('./w:numStyleLink[@w:val]')(an)
            if nsl:
                lazy_load[an_id] = get(nsl[0], 'w:val')
            else:
                self.definitions[an_id] = NumberingDefinition(an)

        level_index = self._level_index

        def create_instance(n, definition):
            """Return a copy of *definition* with the overrides in *n* applied."""
            nd = definition.copy()
            for lo in XPath('./w:lvlOverride')(n):
                # NumberingDefinition keys its levels by int, so the index
                # must be converted or the lookup below can never match.
                ilvl = level_index(get(lo, 'w:ilvl'))
                for lvl in XPath('./w:lvl')(lo)[:1]:
                    if ilvl is None:
                        ilvl = level_index(get(lvl, 'w:ilvl'))
                    if ilvl is None:
                        # Same fallback NumberingDefinition uses for a
                        # missing or malformed level index.
                        ilvl = 0
                    alvl = nd.levels.get(ilvl)
                    if alvl is None:
                        # Store the new level, otherwise the override is lost.
                        alvl = nd.levels[ilvl] = Level()
                    alvl.read_from_xml(lvl, override=True)
            # Without this return every instance was recorded as None.
            return nd

        # First pass: instances whose definition is already available.
        next_pass = {}
        for n in XPath('./w:num[@w:numId]')(root):
            an_id = None
            num_id = get(n, 'w:numId')
            for an in XPath('./w:abstractNumId[@w:val]')(n):
                an_id = get(an, 'w:val')
            d = self.definitions.get(an_id)
            if d is None:
                next_pass[num_id] = (an_id, n)
                continue
            self.instances[num_id] = create_instance(n, d)

        # Resolve style-linked definitions from the instances built above.
        # Links that cannot be resolved are skipped instead of raising
        # KeyError and aborting the whole read.
        numbering_links = styles.numbering_style_links
        for an_id, style_link in lazy_load.items():
            linked = self.instances.get(numbering_links.get(style_link))
            if linked is not None:
                self.definitions[an_id] = linked.copy()

        # Second pass: instances that were waiting for a linked definition.
        for num_id, (an_id, n) in next_pass.items():
            d = self.definitions.get(an_id)
            if d is not None:
                self.instances[num_id] = create_instance(n, d)
def get_name(rtype, defname):
    """Return the name of the part with relationship type *rtype*.

    When the document declares no such relationship, fall back to a part
    called *defname* next to the main document, provided it exists.
    Returns None when neither is available.
    """
    name = relationships_by_type.get(rtype, None)
    if name is None:
        parts = self.docx.document_name.split('/')
        parts[-1] = defname
        # Re-join so the fallback is a '/'-separated name like document_name
        # itself, not a list.
        # NOTE(review): assumes DOCX.exists() and DOCX.read() take the same
        # '/'-separated names as document_name -- confirm against the
        # container class.
        cname = '/'.join(parts)
        if self.docx.exists(cname):
            # The original assigned ``name = name`` here, which left the
            # result as None and so never used the default part.
            name = cname
    return name