Merge from trunk

This commit is contained in:
Charles Haley 2013-05-11 12:00:46 +02:00
commit cb93fd7329
7 changed files with 311 additions and 12 deletions

75
recipes/nrc_next.recipe Normal file
View File

@ -0,0 +1,75 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# Based on veezh's original recipe, Kovid Goyal's New York Times recipe and Snaabs nrc Handelsblad recipe
__license__ = 'GPL v3'
__copyright__ = '2013, Niels Giesen'
'''
www.nrc.nl
'''
import os, zipfile
import time
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
class NRCNext(BasicNewsRecipe):
title = u'nrc•next'
description = u'De ePaper-versie van nrc•next'
language = 'nl'
lang = 'nl-NL'
needs_subscription = True
__author__ = 'Niels Giesen'
conversion_options = {
'no_default_epub_cover' : True
}
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
if self.username is not None and self.password is not None:
br.open('http://login.nrc.nl/login')
br.select_form(nr=0)
br['username'] = self.username
br['password'] = self.password
br.submit()
return br
def build_index(self):
today = time.strftime("%Y%m%d")
domain = "http://digitaleeditie.nrc.nl"
url = domain + "/digitaleeditie/helekrant/epub/nn_" + today + ".epub"
#print url
try:
br = self.get_browser()
f = br.open(url)
except:
self.report_progress(0,_('Kan niet inloggen om editie te downloaden'))
raise ValueError('Krant van vandaag nog niet beschikbaar')
tmp = PersistentTemporaryFile(suffix='.epub')
self.report_progress(0,_('downloading epub'))
tmp.write(f.read())
f.close()
br.close()
if zipfile.is_zipfile(tmp):
try:
zfile = zipfile.ZipFile(tmp.name, 'r')
zfile.extractall(self.output_dir)
self.report_progress(0,_('extracting epub'))
except zipfile.BadZipfile:
self.report_progress(0,_('BadZip error, continuing'))
tmp.close()
index = os.path.join(self.output_dir, 'metadata.opf')
self.report_progress(1,_('epub downloaded and extracted'))
return index

View File

@ -32,7 +32,7 @@ defaults.
# Set the use_series_auto_increment_tweak_when_importing tweak to True to
# use the above values when importing/adding books. If this tweak is set to
# False (the default) then the series number will be set to 1 if it is not
# explicitly set to during the import. If set to True, then the
# explicitly set during the import. If set to True, then the
# series index will be set according to the series_index_auto_increment setting.
# Note that the use_series_auto_increment_tweak_when_importing tweak is used
# only when a value is not provided during import. If the importing regular

View File

@ -0,0 +1,37 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import sys, os, shutil
from lxml import etree
from calibre import walk
from calibre.utils.zipfile import ZipFile
def dump(path):
dest = os.path.splitext(os.path.basename(path))[0]
dest += '_extracted'
if os.path.exists(dest):
shutil.rmtree(dest)
with ZipFile(path) as zf:
zf.extractall(dest)
for f in walk(dest):
if f.endswith('.xml'):
with open(f, 'r+b') as stream:
raw = stream.read()
root = etree.fromstring(raw)
stream.seek(0)
stream.truncate()
stream.write(etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True))
print (path, 'dumped to', dest)
if __name__ == '__main__':
dump(sys.argv[-1])

View File

@ -12,6 +12,7 @@ DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/
DOCPROPS = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties'
APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties'
STYLES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles'
NUMBERING = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering'
namespaces = {
'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',

View File

@ -0,0 +1,156 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.ebooks.docx.block_styles import ParagraphStyle
from calibre.ebooks.docx.char_styles import RunStyle
from calibre.ebooks.docx.names import XPath, get
STYLE_MAP = {
'aiueo': 'hiragana',
'aiueoFullWidth': 'hiragana',
'hebrew1': 'hebrew',
'iroha': 'katakana-iroha',
'irohaFullWidth': 'katakana-iroha',
'lowerLetter': 'lower-alpha',
'lowerRoman': 'lower-roman',
'none': 'none',
'upperLetter': 'upper-alpha',
'upperRoman': 'upper-roman',
'chineseCounting': 'cjk-ideographic',
'decimalZero': 'decimal-leading-zero',
}
class Level(object):
def __init__(self, lvl=None):
self.restart = None
self.start = 0
self.fmt = 'decimal'
self.para_link = None
self.paragraph_style = self.character_style = None
if lvl is not None:
self.read_from_xml(lvl)
def read_from_xml(self, lvl, override=False):
for lr in XPath('./w:lvlRestart[@w:val]')(lvl):
try:
self.restart = int(get(lr, 'w:val'))
except (TypeError, ValueError):
pass
for lr in XPath('./w:start[@w:val]')(lvl):
try:
self.start = int(get(lr, 'w:val'))
except (TypeError, ValueError):
pass
lt = None
for lr in XPath('./w:lvlText[@w:val]')(lvl):
lt = get(lr, 'w:val')
for lr in XPath('./w:numFmt[@w:val]')(lvl):
val = get(lr, 'w:val')
if val == 'bullet':
self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc')
else:
self.fmt = STYLE_MAP.get(val, 'decimal')
for lr in XPath('./w:pStyle[@w:val]')(lvl):
self.para_link = get(lr, 'w:val')
for pPr in XPath('./w:pPr')(lvl):
ps = ParagraphStyle(pPr)
if self.paragraph_style is None:
self.paragraph_style = ps
else:
self.paragraph_style.update(ps)
for rPr in XPath('./w:rPr')(lvl):
ps = RunStyle(rPr)
if self.character_style is None:
self.character_style = ps
else:
self.character_style.update(ps)
def copy(self):
ans = Level()
for x in ('restart', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style'):
setattr(ans, x, getattr(self, x))
return ans
class NumberingDefinition(object):
def __init__(self, parent=None):
self.levels = {}
if parent is not None:
for lvl in XPath('./w:lvl')(parent):
try:
ilvl = int(get(lvl, 'w:ilvl', 0))
except (TypeError, ValueError):
ilvl = 0
self.levels[ilvl] = Level(lvl)
def copy(self):
ans = NumberingDefinition()
for l, lvl in self.levels.iteritems():
ans.levels[l] = lvl.copy()
return ans
class Numbering(object):
def __init__(self):
self.definitions = {}
self.instances = {}
def __call__(self, root, styles):
' Read all numbering style definitions '
lazy_load = {}
for an in XPath('./w:abstractNum[@w:abstractNumId]')(root):
an_id = get(an, 'w:abstractNumId')
nsl = XPath('./w:numStyleLink[@w:val]')(an)
if nsl:
lazy_load[an_id] = get(nsl[0], 'w:val')
else:
nd = NumberingDefinition(an)
self.definitions[an_id] = nd
def create_instance(n, definition):
nd = definition.copy()
for lo in XPath('./w:lvlOverride')(n):
ilvl = get(lo, 'w:ilvl')
for lvl in XPath('./w:lvl')(lo)[:1]:
nilvl = get(lvl, 'w:ilvl')
ilvl = nilvl if ilvl is None else ilvl
alvl = nd.levels.get(ilvl, None)
if alvl is None:
alvl = Level()
alvl.read_from_xml(lvl, override=True)
next_pass = {}
for n in XPath('./w:num[@w:numId]')(root):
an_id = None
num_id = get(n, 'w:numId')
for an in XPath('./w:abstractNumId[@w:val]')(n):
an_id = get(an, 'w:val')
d = self.definitions.get(an_id, None)
if d is None:
next_pass[num_id] = (an_id, n)
continue
self.instances[num_id] = create_instance(n, d)
numbering_links = styles.numbering_style_links
for an_id, style_link in lazy_load.iteritems():
num_id = numbering_links[style_link]
self.definitions[an_id] = self.instances[num_id].copy()
for num_id, (an_id, n) in next_pass.iteritems():
d = self.definitions.get(an_id, None)
if d is not None:
self.instances[num_id] = create_instance(n, d)

View File

@ -52,6 +52,11 @@ class Style(object):
else:
self.character_style.update(rs)
if self.style_type == 'numbering':
self.numbering_style_link = None
for x in XPath('./w:pPr/w:numPr/w:numId[@w:val]')(elem):
self.numbering_style_link = get(x, 'w:val')
def resolve_based_on(self, parent):
if parent.paragraph_style is not None:
if self.paragraph_style is None:
@ -77,6 +82,7 @@ class Styles(object):
self.classes = {}
self.counter = Counter()
self.default_styles = {}
self.numbering_style_links = {}
def __iter__(self):
for s in self.id_map.itervalues():
@ -98,6 +104,8 @@ class Styles(object):
self.id_map[s.style_id] = s
if s.is_default:
self.default_styles[s.style_type] = s
if s.style_type == 'numbering' and s.numbering_style_link:
self.numbering_style_links[s.style_id] = s.numbering_style_link
self.default_paragraph_style = self.default_character_style = None
@ -235,6 +243,9 @@ class Styles(object):
if obj.tag.endswith('}r'):
return self.resolve_run(obj)
def resolve_numbering(self, numbering):
pass # TODO: Implement this
def register(self, css, prefix):
h = hash(tuple(css.iteritems()))
ans, _ = self.classes.get(h, (None, None))

View File

@ -13,8 +13,9 @@ from lxml.html.builder import (
HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR)
from calibre.ebooks.docx.container import DOCX, fromstring
from calibre.ebooks.docx.names import XPath, is_tag, barename, XML, STYLES
from calibre.ebooks.docx.names import XPath, is_tag, barename, XML, STYLES, NUMBERING
from calibre.ebooks.docx.styles import Styles, inherit
from calibre.ebooks.docx.numbering import Numbering
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
class Text:
@ -89,12 +90,20 @@ class Convert(object):
self.write()
def read_styles(self, relationships_by_type):
sname = relationships_by_type.get(STYLES, None)
if sname is None:
name = self.docx.document_name.split('/')
name[-1] = 'styles.xml'
if self.docx.exists(name):
sname = name
def get_name(rtype, defname):
name = relationships_by_type.get(rtype, None)
if name is None:
cname = self.docx.document_name.split('/')
cname[-1] = defname
if self.docx.exists(cname):
name = name
return name
nname = get_name(NUMBERING, 'numbering.xml')
sname = get_name(STYLES, 'styles.xml')
numbering = Numbering()
if sname is not None:
try:
raw = self.docx.read(sname)
@ -103,6 +112,16 @@ class Convert(object):
else:
self.styles(fromstring(raw))
if nname is not None:
try:
raw = self.docx.read(nname)
except KeyError:
self.log.warn('Numbering styles %s do not exist' % nname)
else:
numbering(fromstring(raw), self.styles)
self.styles.resolve_numbering(numbering)
def write(self):
raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f: