Merge from trunk

This commit is contained in:
Charles Haley 2013-05-14 16:46:41 +02:00
commit d80579a827
33 changed files with 753 additions and 198 deletions

View File

@ -1,32 +1,37 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe
import re
class NetMagazineRecipe (BasicNewsRecipe):
__author__ = u'Marc Busqué <marc@lamarciana.com>'
__url__ = 'http://www.lamarciana.com'
__version__ = '1.0'
__license__ = 'GPL v3'
__copyright__ = u'2012, Marc Busqué <marc@lamarciana.com>'
title = u'.net magazine'
description = u'net is the worlds best-selling magazine for web designers and developers, featuring tutorials from leading agencies, interviews with the webs biggest names, and agenda-setting features on the hottest issues affecting the internet today.'
language = 'en'
tags = 'web development, software'
oldest_article = 7
remove_empty_feeds = True
no_stylesheets = True
cover_url = u'http://media.netmagazine.futurecdn.net/sites/all/themes/netmag/logo.png'
keep_only_tags = [
dict(name='article', attrs={'class': re.compile('^node.*$', re.IGNORECASE)})
]
remove_tags = [
dict(name='span', attrs={'class': 'comment-count'}),
dict(name='div', attrs={'class': 'item-list share-links'}),
dict(name='footer'),
]
remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height', 'style']
extra_css = 'img {max-width: 100%; display: block; margin: auto;} .captioned-image div {text-align: center; font-style: italic;}'
class dotnetMagazine (BasicNewsRecipe):
__author__ = u'Bonni Salles'
__version__ = '1.0'
__license__ = 'GPL v3'
__copyright__ = u'2013, Bonni Salles'
title = '.net magazine'
oldest_article = 7
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'en'
remove_empty_feeds = True
extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} '
cover_url = u'http://media.netmagazine.futurecdn.net/sites/all/themes/netmag/logo.png'
remove_tags_after = dict(name='footer', id=lambda x:not x)
remove_tags_before = dict(name='header', id=lambda x:not x)
remove_tags = [
dict(name='div', attrs={'class': 'item-list'}),
dict(name='h4', attrs={'class': 'std-hdr'}),
dict(name='div', attrs={'class': 'item-list share-links'}), #removes share links
dict(name=['script', 'noscript']),
dict(name='div', attrs={'id': 'comments-form'}), #comment these out if you want the comments to show
dict(name='div', attrs={'id': re.compile('advertorial_block_($|| )')}),
dict(name='div', attrs={'id': 'right-col'}),
dict(name='div', attrs={'id': 'comments'}), #comment these out if you want the comments to show
dict(name='div', attrs={'class': 'item-list related-content'}),
feeds = [
(u'.net', u'http://feeds.feedburner.com/net/topstories'),
]
feeds = [
(u'net', u'http://feeds.feedburner.com/net/topstories')
]

View File

@ -66,10 +66,8 @@ else:
filesystem_encoding = 'utf-8'
# On linux, unicode arguments to os file functions are coerced to an ascii
# bytestring if sys.getfilesystemencoding() == 'ascii', which is
# just plain dumb. So issue a warning.
print ('WARNING: You do not have the LANG environment variable set correctly. '
'This will cause problems with non-ascii filenames. '
'Set it to something like en_US.UTF-8.\n')
# just plain dumb. This is fixed by the icu.py module which, when
# imported, changes ascii to utf-8.
except:
filesystem_encoding = 'utf-8'

View File

@ -240,7 +240,8 @@ class ANDROID(USBMS):
'ADVANCED', 'SGH-I727', 'USB_FLASH_DRIVER', 'ANDROID',
'S5830I_CARD', 'MID7042', 'LINK-CREATE', '7035', 'VIEWPAD_7E',
'NOVO7', 'MB526', '_USB#WYK7MSF8KE', 'TABLET_PC', 'F', 'MT65XX_MS',
'ICS', 'E400', '__FILE-STOR_GADG', 'ST80208-1', 'GT-S5660M_CARD', 'XT894']
'ICS', 'E400', '__FILE-STOR_GADG', 'ST80208-1', 'GT-S5660M_CARD', 'XT894', '_USB',
]
WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
'FILE-STOR_GADGET', 'SGH-T959_CARD', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD',
@ -251,7 +252,9 @@ class ANDROID(USBMS):
'FILE-CD_GADGET', 'GT-I9001_CARD', 'USB_2.0', 'XT875',
'UMS_COMPOSITE', 'PRO', '.KOBO_VOX', 'SGH-T989_CARD', 'SGH-I727',
'USB_FLASH_DRIVER', 'ANDROID', 'MID7042', '7035', 'VIEWPAD_7E',
'NOVO7', 'ADVANCED', 'TABLET_PC', 'F', 'E400_SD_CARD', 'ST80208-1', 'XT894']
'NOVO7', 'ADVANCED', 'TABLET_PC', 'F', 'E400_SD_CARD', 'ST80208-1', 'XT894',
'_USB',
]
OSX_MAIN_MEM = 'Android Device Main Memory'

View File

@ -107,6 +107,12 @@ class DevicePlugin(Plugin):
#: :meth:`set_user_blacklisted_devices`
ASK_TO_ALLOW_CONNECT = False
#: Set this to a dictionary of the form {'title':title, 'msg':msg, 'det_msg':detailed_msg} to have calibre pop up
#: a message to the user after some callbacks are run (currently only upload_books).
#: Be careful not to spam the user with too many messages. This variable is checked after *every* callback,
#: so only set it when you really need to.
user_feedback_after_callback = None
@classmethod
def get_gui_name(cls):
if hasattr(cls, 'gui_name'):
@ -157,16 +163,15 @@ class DevicePlugin(Plugin):
if (vid in device_id or vidd in device_id) and \
(pid in device_id or pidd in device_id) and \
self.test_bcd_windows(device_id, bcd):
if debug:
self.print_usb_device_info(device_id)
if only_presence or self.can_handle_windows(device_id, debug=debug):
try:
bcd = int(device_id.rpartition(
'rev_')[-1].replace(':', 'a'), 16)
except:
bcd = None
return True, (vendor_id, product_id, bcd, None,
None, None)
if debug:
self.print_usb_device_info(device_id)
if only_presence or self.can_handle_windows(device_id, debug=debug):
try:
bcd = int(device_id.rpartition(
'rev_')[-1].replace(':', 'a'), 16)
except:
bcd = None
return True, (vendor_id, product_id, bcd, None, None, None)
return False, None
def test_bcd(self, bcdDevice, bcd):
@ -638,7 +643,6 @@ class DevicePlugin(Plugin):
'''
device_prefs.set_overrides()
# Dynamic control interface.
# The following methods are probably called on the GUI thread. Any driver
# that implements these methods must take pains to be thread safe, because

View File

@ -39,8 +39,8 @@ class PRST1(USBMS):
path_sep = '/'
booklist_class = CollectionsBookList
FORMATS = ['epub', 'pdf', 'txt', 'book', 'zbf'] # The last two are
# used in japan
FORMATS = ['epub', 'pdf', 'txt', 'book', 'zbf'] # The last two are
# used in japan
CAN_SET_METADATA = ['collections']
CAN_DO_DEVICE_DB_PLUGBOARD = True
@ -50,10 +50,10 @@ class PRST1(USBMS):
VENDOR_NAME = 'SONY'
WINDOWS_MAIN_MEM = re.compile(
r'(PRS-T(1|2)&)'
r'(PRS-T(1|2|2N)&)'
)
WINDOWS_CARD_A_MEM = re.compile(
r'(PRS-T(1|2)__SD&)'
r'(PRS-T(1|2|2N)__SD&)'
)
MAIN_MEMORY_VOLUME_LABEL = 'SONY Reader Main Memory'
STORAGE_CARD_VOLUME_LABEL = 'SONY Reader Storage Card'
@ -66,7 +66,7 @@ class PRST1(USBMS):
EXTRA_CUSTOMIZATION_MESSAGE = [
_('Comma separated list of metadata fields '
'to turn into collections on the device. Possibilities include: ')+\
'to turn into collections on the device. Possibilities include: ')+
'series, tags, authors',
_('Upload separate cover thumbnails for books') +
':::'+_('Normally, the SONY readers get the cover image from the'
@ -194,17 +194,17 @@ class PRST1(USBMS):
time_offsets = {}
for i, row in enumerate(cursor):
try:
comp_date = int(os.path.getmtime(self.normalize_path(prefix + row[0])) * 1000);
comp_date = int(os.path.getmtime(self.normalize_path(prefix + row[0])) * 1000)
except (OSError, IOError, TypeError):
# In case the db has incorrect path info
continue
device_date = int(row[1]);
device_date = int(row[1])
offset = device_date - comp_date
time_offsets.setdefault(offset, 0)
time_offsets[offset] = time_offsets[offset] + 1
try:
device_offset = max(time_offsets,key = lambda a: time_offsets.get(a))
device_offset = max(time_offsets, key=lambda a: time_offsets.get(a))
debug_print("Device Offset: %d ms"%device_offset)
self.device_offset = device_offset
except ValueError:
@ -213,7 +213,7 @@ class PRST1(USBMS):
for idx, book in enumerate(bl):
query = 'SELECT _id, thumbnail FROM books WHERE file_path = ?'
t = (book.lpath,)
cursor.execute (query, t)
cursor.execute(query, t)
for i, row in enumerate(cursor):
book.device_collections = bl_collections.get(row[0], None)
@ -318,14 +318,14 @@ class PRST1(USBMS):
' any notes/highlights, etc.')%dbpath)+' Underlying error:'
'\n'+tb)
def get_lastrowid(self, cursor):
# SQLite3 + Python has a fun issue on 32-bit systems with integer overflows.
# Issue a SQL query instead, getting the value as a string, and then converting to a long python int manually.
query = 'SELECT last_insert_rowid()'
cursor.execute(query)
row = cursor.fetchone()
def get_lastrowid(self, cursor):
# SQLite3 + Python has a fun issue on 32-bit systems with integer overflows.
# Issue a SQL query instead, getting the value as a string, and then converting to a long python int manually.
query = 'SELECT last_insert_rowid()'
cursor.execute(query)
row = cursor.fetchone()
return long(row[0])
return long(row[0])
def get_database_min_id(self, source_id):
sequence_min = 0L
@ -345,7 +345,7 @@ class PRST1(USBMS):
# Insert the sequence Id if it doesn't
query = ('INSERT INTO sqlite_sequence (name, seq) '
'SELECT ?, ? '
'WHERE NOT EXISTS (SELECT 1 FROM sqlite_sequence WHERE name = ?)');
'WHERE NOT EXISTS (SELECT 1 FROM sqlite_sequence WHERE name = ?)')
cursor.execute(query, (table, sequence_id, table,))
cursor.close()

View File

@ -77,7 +77,7 @@ class Plumber(object):
def __init__(self, input, output, log, report_progress=DummyReporter(),
dummy=False, merge_plugin_recs=True, abort_after_input_dump=False,
override_input_metadata=False):
override_input_metadata=False, for_regex_wizard=False):
'''
:param input: Path to input file.
:param output: Path to output file/directory
@ -87,6 +87,7 @@ class Plumber(object):
if isbytestring(output):
output = output.decode(filesystem_encoding)
self.original_input_arg = input
self.for_regex_wizard = for_regex_wizard
self.input = os.path.abspath(input)
self.output = os.path.abspath(output)
self.log = log
@ -123,7 +124,7 @@ OptionRecommendation(name='input_profile',
'conversion system information on how to interpret '
'various information in the input document. For '
'example resolution dependent lengths (i.e. lengths in '
'pixels). Choices are:')+\
'pixels). Choices are:')+
', '.join([x.short_name for x in input_profiles()])
),
@ -135,7 +136,7 @@ OptionRecommendation(name='output_profile',
'created document for the specified device. In some cases, '
'an output profile is required to produce documents that '
'will work on a device. For example EPUB on the SONY reader. '
'Choices are:') + \
'Choices are:') +
', '.join([x.short_name for x in output_profiles()])
),
@ -490,7 +491,7 @@ OptionRecommendation(name='asciiize',
'cases where there are multiple representations of a character '
'(characters shared by Chinese and Japanese for instance) the '
'representation based on the current calibre interface language will be '
'used.')%\
'used.')%
u'\u041c\u0438\u0445\u0430\u0438\u043b '
u'\u0413\u043e\u0440\u0431\u0430\u0447\u0451\u0432'
)
@ -711,7 +712,6 @@ OptionRecommendation(name='search_replace',
self.input_fmt = input_fmt
self.output_fmt = output_fmt
self.all_format_options = set()
self.input_options = set()
self.output_options = set()
@ -775,7 +775,7 @@ OptionRecommendation(name='search_replace',
if not html_files:
raise ValueError(_('Could not find an ebook inside the archive'))
html_files = [(f, os.stat(f).st_size) for f in html_files]
html_files.sort(cmp = lambda x, y: cmp(x[1], y[1]))
html_files.sort(cmp=lambda x, y: cmp(x[1], y[1]))
html_files = [f[0] for f in html_files]
for q in ('toc', 'index'):
for f in html_files:
@ -783,8 +783,6 @@ OptionRecommendation(name='search_replace',
return f, os.path.splitext(f)[1].lower()[1:]
return html_files[-1], os.path.splitext(html_files[-1])[1].lower()[1:]
def get_option_by_name(self, name):
for group in (self.input_options, self.pipeline_options,
self.output_options, self.all_format_options):
@ -956,7 +954,6 @@ OptionRecommendation(name='search_replace',
self.log.info('Input debug saved to:', out_dir)
def run(self):
'''
Run the conversion pipeline
@ -965,10 +962,12 @@ OptionRecommendation(name='search_replace',
self.setup_options()
if self.opts.verbose:
self.log.filter_level = self.log.DEBUG
if self.for_regex_wizard and hasattr(self.opts, 'no_process'):
self.opts.no_process = True
self.flush()
import cssutils, logging
cssutils.log.setLevel(logging.WARN)
get_types_map() # Ensure the mimetypes module is intialized
get_types_map() # Ensure the mimetypes module is intialized
if self.opts.debug_pipeline is not None:
self.opts.verbose = max(self.opts.verbose, 4)
@ -1003,6 +1002,8 @@ OptionRecommendation(name='search_replace',
self.ui_reporter(0.01, _('Converting input to HTML...'))
ir = CompositeProgressReporter(0.01, 0.34, self.ui_reporter)
self.input_plugin.report_progress = ir
if self.for_regex_wizard:
self.input_plugin.for_viewer = True
with self.input_plugin:
self.oeb = self.input_plugin(stream, self.opts,
self.input_fmt, self.log,
@ -1014,8 +1015,12 @@ OptionRecommendation(name='search_replace',
if self.input_fmt in ('recipe', 'downloaded_recipe'):
self.opts_to_mi(self.user_metadata)
if not hasattr(self.oeb, 'manifest'):
self.oeb = create_oebbook(self.log, self.oeb, self.opts,
encoding=self.input_plugin.output_encoding)
self.oeb = create_oebbook(
self.log, self.oeb, self.opts,
encoding=self.input_plugin.output_encoding,
for_regex_wizard=self.for_regex_wizard)
if self.for_regex_wizard:
return
self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
self.opts.is_image_collection = self.input_plugin.is_image_collection
pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter)
@ -1081,7 +1086,6 @@ OptionRecommendation(name='search_replace',
self.dump_oeb(self.oeb, out_dir)
self.log('Structured HTML written to:', out_dir)
if self.opts.extra_css and os.path.exists(self.opts.extra_css):
self.opts.extra_css = open(self.opts.extra_css, 'rb').read()
@ -1161,13 +1165,20 @@ OptionRecommendation(name='search_replace',
self.log(self.output_fmt.upper(), 'output written to', self.output)
self.flush()
# This has to be global as create_oebbook can be called from other locations
# (for example in the html input plugin)
# create_oebbook() hands the current value to HTMLPreProcessor as its
# regex_wizard_callback argument. None means no callback is installed.
regex_wizard_callback = None
# Install f as the module-wide regex wizard callback. Passing None removes
# a previously installed callback.
def set_regex_wizard_callback(f):
global regex_wizard_callback
regex_wizard_callback = f
def create_oebbook(log, path_or_stream, opts, reader=None,
encoding='utf-8', populate=True):
encoding='utf-8', populate=True, for_regex_wizard=False):
'''
Create an OEBBook.
'''
from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor(log, opts)
html_preprocessor = HTMLPreProcessor(log, opts, regex_wizard_callback=regex_wizard_callback)
if not encoding:
encoding = None
oeb = OEBBook(log, html_preprocessor,
@ -1182,3 +1193,4 @@ def create_oebbook(log, path_or_stream, opts, reader=None,
reader()(oeb, path_or_stream)
return oeb

View File

@ -14,7 +14,7 @@ SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'
convert_entities = functools.partial(entity_to_unicode,
result_exceptions = {
result_exceptions={
u'<' : '&lt;',
u'>' : '&gt;',
u"'" : '&apos;',
@ -144,9 +144,9 @@ class DocAnalysis(object):
percent is the percentage of lines that should be in a single bucket to return true
The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
'''
minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
maxLineLength=1900 # Discard larger than this to stay in range
buckets=20 # Each line is divided into a bucket based on length
minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
maxLineLength=1900 # Discard larger than this to stay in range
buckets=20 # Each line is divided into a bucket based on length
#print "there are "+str(len(lines))+" lines"
#max = 0
@ -156,7 +156,7 @@ class DocAnalysis(object):
# max = l
#print "max line found is "+str(max)
# Build the line length histogram
hRaw = [ 0 for i in range(0,buckets) ]
hRaw = [0 for i in range(0,buckets)]
for line in self.lines:
l = len(line)
if l > minLineLength and l < maxLineLength:
@ -167,7 +167,7 @@ class DocAnalysis(object):
# Normalize the histogram into percents
totalLines = len(self.lines)
if totalLines > 0:
h = [ float(count)/totalLines for count in hRaw ]
h = [float(count)/totalLines for count in hRaw]
else:
h = []
#print "\nhRaw histogram lengths are: "+str(hRaw)
@ -200,7 +200,7 @@ class Dehyphenator(object):
# Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex'
# only remove if it's not already the point of hyphenation
self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$"
self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$" # noqa
self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation
@ -265,19 +265,18 @@ class Dehyphenator(object):
self.html = html
self.format = format
if format == 'html':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length) # noqa
elif format == 'pdf':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
elif format == 'txt':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length) # noqa
elif format == 'individual_words':
intextmatch = re.compile(u'(?!<)(?P<firstpart>[^\W\-]+)(-|)\s*(?P<secondpart>\w+)(?![^<]*?>)')
elif format == 'html_cleanup':
intextmatch = re.compile(u'(?P<firstpart>[^\W\-]+)(-|)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
intextmatch = re.compile(u'(?P<firstpart>[^\W\-]+)(-|)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)') # noqa
elif format == 'txt_cleanup':
intextmatch = re.compile(u'(?P<firstpart>[^\W\-]+)(-|)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
html = intextmatch.sub(self.dehyphenate, html)
return html
@ -498,9 +497,11 @@ class HTMLPreProcessor(object):
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
]
def __init__(self, log=None, extra_opts=None):
def __init__(self, log=None, extra_opts=None, regex_wizard_callback=None):
self.log = log
self.extra_opts = extra_opts
self.regex_wizard_callback = regex_wizard_callback
self.current_href = None
def is_baen(self, src):
return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
@ -581,12 +582,15 @@ class HTMLPreProcessor(object):
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
end_rules.append(
# Un wrap using punctuation
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), # noqa
)
for rule in self.PREPROCESS + start_rules:
html = rule[0].sub(rule[1], html)
if self.regex_wizard_callback is not None:
self.regex_wizard_callback(self.current_href, html)
if get_preprocess_html:
return html

View File

@ -175,6 +175,20 @@ def read_shd(parent, dest):
if val:
ans = simple_color(val, auto='transparent')
setattr(dest, 'background_color', ans)
def read_numbering(parent, dest):
    '''
    Read the list numbering reference (w:numPr) of parent into dest.numbering.

    dest.numbering becomes a (num_id, level) tuple when either a w:numId or a
    parseable w:ilvl is found, otherwise it is set to inherit. When several
    matching elements exist the last one wins.
    '''
    level = None
    numbering_id = None
    for num_pr in XPath('./w:numPr')(parent):
        for level_elem in XPath('./w:ilvl[@w:val]')(num_pr):
            try:
                level = int(get(level_elem, 'w:val'))
            except (ValueError, TypeError):
                # Unparseable level: keep whatever was found before
                pass
        for id_elem in XPath('./w:numId[@w:val]')(num_pr):
            numbering_id = get(id_elem, 'w:val')
    if numbering_id is None and level is None:
        dest.numbering = inherit
    else:
        dest.numbering = (numbering_id, level)
# }}}
class ParagraphStyle(object):
@ -194,6 +208,7 @@ class ParagraphStyle(object):
# Misc.
'text_indent', 'text_align', 'line_height', 'direction', 'background_color',
'numbering', 'font_family', 'font_size',
)
def __init__(self, pPr=None):
@ -210,13 +225,15 @@ class ParagraphStyle(object):
):
setattr(self, p, binary_property(pPr, p))
for x in ('border', 'indent', 'justification', 'spacing', 'direction', 'shd'):
for x in ('border', 'indent', 'justification', 'spacing', 'direction', 'shd', 'numbering'):
f = globals()['read_%s' % x]
f(pPr, self)
for s in XPath('./w:pStyle[@w:val]')(pPr):
self.linked_style = get(s, 'w:val')
self.font_family = self.font_size = inherit
self._css = None
def update(self, other):
@ -256,10 +273,16 @@ class ParagraphStyle(object):
if val is not inherit:
c['margin-%s' % edge] = val
for x in ('text_indent', 'text_align', 'line_height', 'background_color'):
if self.line_height not in {inherit, '1'}:
c['line-height'] = self.line_height
for x in ('text_indent', 'text_align', 'background_color', 'font_family', 'font_size'):
val = getattr(self, x)
if val is not inherit:
if x == 'font_size':
val = '%.3gpt' % val
c[x.replace('_', '-')] = val
return self._css
# TODO: keepNext must be done at markup level

View File

@ -113,6 +113,14 @@ def read_vert_align(parent, dest):
if val and val in {'baseline', 'subscript', 'superscript'}:
ans = val
setattr(dest, 'vert_align', ans)
def read_font_family(parent, dest):
    '''
    Set dest.font_family from the w:ascii attribute of the w:rFonts children
    of parent. The last non-empty value wins; when there is none the property
    is set to inherit.
    '''
    family = inherit
    for fonts_elem in XPath('./w:rFonts[@w:ascii]')(parent):
        candidate = get(fonts_elem, 'w:ascii')
        if not candidate:
            continue
        family = candidate
    dest.font_family = family
# }}}
class RunStyle(object):
@ -122,7 +130,7 @@ class RunStyle(object):
'rtl', 'shadow', 'smallCaps', 'strike', 'vanish',
'border_color', 'border_style', 'border_width', 'padding', 'color', 'highlight', 'background_color',
'letter_spacing', 'font_size', 'text_decoration', 'vert_align', 'lang',
'letter_spacing', 'font_size', 'text_decoration', 'vert_align', 'lang', 'font_family'
}
toggle_properties = {
@ -141,7 +149,7 @@ class RunStyle(object):
):
setattr(self, p, binary_property(rPr, p))
for x in ('text_border', 'color', 'highlight', 'shd', 'letter_spacing', 'sz', 'underline', 'vert_align', 'lang'):
for x in ('text_border', 'color', 'highlight', 'shd', 'letter_spacing', 'sz', 'underline', 'vert_align', 'lang', 'font_family'):
f = globals()['read_%s' % x]
f(rPr, self)
@ -164,6 +172,18 @@ class RunStyle(object):
if val is inherit:
setattr(self, p, getattr(parent, p))
def get_border_css(self, ans):
    '''
    Add the border-color, border-style and border-width declarations of this
    style to the dictionary ans. Properties that are set to inherit are
    skipped; the width is formatted as a length in pt.
    '''
    for part in ('color', 'style', 'width'):
        value = getattr(self, 'border_' + part)
        if value is inherit:
            continue
        if part == 'width':
            value = '%.3gpt' % value
        ans['border-%s' % part] = value
def clear_border_css(self):
    ''' Reset border_color, border_style and border_width to inherit. '''
    self.border_color = self.border_style = self.border_width = inherit
@property
def css(self):
if self._css is None:
@ -188,12 +208,7 @@ class RunStyle(object):
if self.vanish is True:
c['display'] = 'none'
for x in ('color', 'style', 'width'):
val = getattr(self, 'border_'+x)
if x == 'width' and val is not inherit:
val = '%.3gpt' % val
if val is not inherit:
c['border-%s' % x] = val
self.get_border_css(c)
if self.padding is not inherit:
c['padding'] = '%.3gpt' % self.padding
@ -212,6 +227,10 @@ class RunStyle(object):
if self.b:
c['font-weight'] = 'bold'
if self.font_family is not inherit:
c['font-family'] = self.font_family
return self._css
def same_border(self, other):

View File

@ -167,7 +167,9 @@ class DOCX(object):
@property
def document_relationships(self):
name = self.document_name
return self.get_relationships(self.document_name)
def get_relationships(self, name):
base = '/'.join(name.split('/')[:-1])
by_id, by_type = {}, {}
parts = name.split('/')

View File

@ -22,7 +22,7 @@ def dump(path):
zf.extractall(dest)
for f in walk(dest):
if f.endswith('.xml'):
if f.endswith('.xml') or f.endswith('.rels'):
with open(f, 'r+b') as stream:
raw = stream.read()
root = etree.fromstring(raw)

View File

@ -0,0 +1,132 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import os, re
from collections import namedtuple
from calibre.ebooks.docx.block_styles import binary_property, inherit
from calibre.ebooks.docx.names import XPath, get
from calibre.utils.filenames import ascii_filename
from calibre.utils.fonts.scanner import font_scanner, NoFonts
from calibre.utils.fonts.utils import panose_to_css_generic_family, is_truetype_font
Embed = namedtuple('Embed', 'name key subsetted')
def has_system_fonts(name):
    '''
    Return True if the font scanner reports at least one font for the family
    name, False if it reports none or raises NoFonts.
    '''
    try:
        matches = font_scanner.fonts_for_family(name)
    except NoFonts:
        return False
    return bool(matches)
def get_variant(bold=False, italic=False):
    '''
    Map a (bold, italic) pair of flags to the variant name used as key for
    embedded fonts: 'Regular', 'Italic', 'Bold' or 'BoldItalic'.
    '''
    variants = {
        (False, False): 'Regular',
        (False, True): 'Italic',
        (True, False): 'Bold',
        (True, True): 'BoldItalic',
    }
    return variants[(bold, italic)]
class Family(object):

    '''
    Information read from a single font element: its name and alternate
    names, the embedded variants, and the generic CSS family to fall back to.
    '''

    def __init__(self, elem, embed_relationships):
        '''
        :param elem: The font element to read
        :param embed_relationships: Mapping of relationship id to the value
            stored as the name of an embedded font
        '''
        self.name = self.family_name = get(elem, 'w:name')
        self.alt_names = tuple(get(alt, 'w:val') for alt in XPath('./w:altName')(elem))
        # Prefer the first alternate name that is installed on this system,
        # but only if the primary name is not installed itself
        if self.alt_names and not has_system_fonts(self.name):
            for candidate in self.alt_names:
                if has_system_fonts(candidate):
                    self.family_name = candidate
                    break

        # variant name -> Embed tuple
        self.embedded = {}
        for variant in ('Regular', 'Bold', 'Italic', 'BoldItalic'):
            for node in XPath('./w:embed%s[@r:id]' % variant)(elem):
                rid = get(node, 'r:id')
                font_key = get(node, 'w:fontKey')
                is_subsetted = get(node, 'w:subsetted') in {'1', 'true', 'on'}
                if rid in embed_relationships:
                    self.embedded[variant] = Embed(embed_relationships[rid], font_key, is_subsetted)

        self.generic_family = 'auto'
        for node in XPath('./w:family[@w:val]')(elem):
            self.generic_family = get(node, 'w:val', 'auto')

        not_truetype = binary_property(elem, 'notTrueType')
        self.is_ttf = not_truetype is inherit or not not_truetype

        self.panose1 = None
        self.panose_name = None
        for node in XPath('./w:panose1[@w:val]')(elem):
            try:
                raw = get(node, 'w:val')
                parsed = tuple(int(raw[i:i+2], 16) for i in xrange(0, len(raw), 2))
            except (TypeError, ValueError, IndexError):
                # Malformed panose data is ignored
                continue
            self.panose1 = parsed
            self.panose_name = panose_to_css_generic_family(parsed)

        # Fallback order: declared generic family, panose classification, serif
        generic_map = {
            'roman':'serif', 'swiss':'sans-serif', 'modern':'monospace',
            'decorative':'fantasy', 'script':'cursive'}
        self.css_generic_family = (
            generic_map.get(self.generic_family, None) or self.panose_name or 'serif')
class Fonts(object):

    '''
    Registry of the font families read from the font elements of a document.
    It records which (family, variant) pairs are requested via family_for()
    so that only those embedded fonts are written out by embed_fonts().
    '''

    def __init__(self):
        # font name -> Family
        self.fonts = {}
        # set of (font name, variant) pairs requested through family_for()
        self.used = set()

    def __call__(self, root, embed_relationships, docx, dest_dir):
        '''
        Read every font element that has a name from root. docx and dest_dir
        are accepted but not used here.
        '''
        for elem in XPath('//w:font[@w:name]')(root):
            self.fonts[get(elem, 'w:name')] = Family(elem, embed_relationships)

    def family_for(self, name, bold=False, italic=False):
        '''
        Return the CSS font-family value to use for the font name and mark
        the requested variant as used. Unknown fonts map to 'serif'.
        '''
        f = self.fonts.get(name, None)
        if f is None:
            return 'serif'
        variant = get_variant(bold, italic)
        self.used.add((name, variant))
        # An embedded variant is declared under the original name (see
        # embed_fonts), otherwise use the (possibly alternate) family name
        name = f.name if variant in f.embedded else f.family_name
        return '"%s", %s' % (name.replace('"', ''), f.css_generic_family)

    def embed_fonts(self, dest_dir, docx):
        '''
        Write all used fonts that are embedded in the document into the
        fonts sub-directory of dest_dir and return the matching @font-face
        rules as a single string (empty if nothing was written).
        '''
        defs = []
        dest_dir = os.path.join(dest_dir, 'fonts')
        for name, variant in self.used:
            f = self.fonts[name]
            if variant in f.embedded:
                # Create the directory lazily, only when there is a font to write
                if not os.path.exists(dest_dir):
                    os.mkdir(dest_dir)
                fname = self.write(name, dest_dir, docx, variant)
                if fname is not None:
                    d = {'font-family':'"%s"' % name.replace('"', ''), 'src': 'url("fonts/%s")' % fname}
                    if 'Bold' in variant:
                        d['font-weight'] = 'bold'
                    if 'Italic' in variant:
                        d['font-style'] = 'italic'
                    d = ['%s: %s' % (k, v) for k, v in d.iteritems()]
                    d = ';\n\t'.join(d)
                    defs.append('@font-face {\n\t%s\n}\n' % d)
        return '\n'.join(defs)

    @staticmethod
    def _deobfuscate(prefix, font_key):
        '''
        XOR the bytes of prefix with the bytes of font_key and return the
        result as a bytestring.

        All non hexadecimal characters are dropped from font_key, the
        remaining digits are read as bytes and their order is reversed; the
        resulting key is repeated as often as needed. If font_key contains no
        hexadecimal digits at all, prefix is returned unchanged instead of
        failing with a division by zero.
        '''
        digits = re.sub(r'[^A-Fa-f0-9]', '', font_key)
        key = bytearray(reversed(tuple(
            int(digits[i:i+2], 16) for i in range(0, len(digits), 2))))
        if not key:
            return prefix
        return bytes(bytearray(
            b ^ key[i % len(key)] for i, b in enumerate(bytearray(prefix))))

    def write(self, name, dest_dir, docx, variant):
        '''
        Write the embedded font for (name, variant) into dest_dir.

        Only the first 32 bytes of the font data are affected by the font
        key. Returns the file name that was written, or None if the resulting
        data is not recognized as a TrueType/OpenType font.
        '''
        f = self.fonts[name]
        ef = f.embedded[variant]
        raw = docx.read(ef.name)
        prefix = raw[:32]
        if ef.key:
            prefix = self._deobfuscate(prefix, ef.key)
        if not is_truetype_font(prefix):
            return None
        ext = 'otf' if prefix.startswith(b'OTTO') else 'ttf'
        fname = ascii_filename('%s - %s.%s' % (name, variant, ext))
        with open(os.path.join(dest_dir, fname), 'wb') as dest:
            dest.write(prefix)
            dest.write(raw[32:])

        return fname

View File

@ -13,6 +13,7 @@ DOCPROPS = 'http://schemas.openxmlformats.org/package/2006/relationships/metada
APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties'
STYLES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles'
NUMBERING = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering'
FONTS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable'
namespaces = {
'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',

View File

@ -6,6 +6,11 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from collections import Counter
from lxml.html.builder import OL, UL, SPAN
from calibre.ebooks.docx.block_styles import ParagraphStyle
from calibre.ebooks.docx.char_styles import RunStyle
from calibre.ebooks.docx.names import XPath, get
@ -33,10 +38,26 @@ class Level(object):
self.fmt = 'decimal'
self.para_link = None
self.paragraph_style = self.character_style = None
self.is_numbered = False
self.num_template = None
if lvl is not None:
self.read_from_xml(lvl)
def copy(self):
    '''
    Return a new Level with the same attribute values as this one. The
    attribute values themselves are shared, not copied.
    '''
    clone = Level()
    copied = ('restart', 'start', 'fmt', 'para_link', 'paragraph_style',
              'character_style', 'is_numbered', 'num_template')
    for attr in copied:
        setattr(clone, attr, getattr(self, attr))
    return clone
def format_template(self, counter, ilvl):
    '''
    Render the number template of this level for a list item at level ilvl.

    Every %N placeholder (N is one-based) is replaced by the value of level
    N-1 in counter. Placeholders for levels deeper than ilvl, or missing
    from counter, are replaced by the empty string. Values of levels other
    than ilvl are reduced by one. Trailing whitespace is stripped and a
    single no-break space is appended.
    '''
    def expand(match):
        level = int(match.group(1)) - 1
        if level > ilvl or level not in counter:
            return ''
        if level == ilvl:
            shown = counter[level]
        else:
            shown = counter[level] - 1
        return '%d' % shown

    rendered = re.sub(r'%(\d+)', expand, self.num_template)
    return rendered.rstrip() + '\xa0'
def read_from_xml(self, lvl, override=False):
for lr in XPath('./w:lvlRestart[@w:val]')(lvl):
try:
@ -57,9 +78,13 @@ class Level(object):
for lr in XPath('./w:numFmt[@w:val]')(lvl):
val = get(lr, 'w:val')
if val == 'bullet':
self.is_numbered = False
self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc')
else:
self.is_numbered = True
self.fmt = STYLE_MAP.get(val, 'decimal')
if lt and re.match(r'%\d+\.$', lt) is None:
self.num_template = lt
for lr in XPath('./w:pStyle[@w:val]')(lvl):
self.para_link = get(lr, 'w:val')
@ -78,12 +103,6 @@ class Level(object):
else:
self.character_style.update(ps)
def copy(self):
ans = Level()
for x in ('restart', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style'):
setattr(ans, x, getattr(self, x))
return ans
class NumberingDefinition(object):
def __init__(self, parent=None):
@ -107,6 +126,7 @@ class Numbering(object):
def __init__(self):
# Numbering definitions (filled in outside of this method)
self.definitions = {}
# num_id -> numbering instance created from a definition
self.instances = {}
# num_id -> Counter holding the current value of every level
self.counters = {}
def __call__(self, root, styles):
' Read all numbering style definitions '
@ -131,6 +151,7 @@ class Numbering(object):
if alvl is None:
alvl = Level()
alvl.read_from_xml(lvl, override=True)
return nd
next_pass = {}
for n in XPath('./w:num[@w:numId]')(root):
@ -154,3 +175,114 @@ class Numbering(object):
if d is not None:
self.instances[num_id] = create_instance(n, d)
for num_id, d in self.instances.iteritems():
self.counters[num_id] = Counter({lvl:d.levels[lvl].start for lvl in d.levels})
def get_pstyle(self, num_id, style_id):
    ''' Return the index of the level in numbering instance num_id whose
    linked paragraph style (para_link) is style_id, or None when the
    instance is unknown or no level links to that style. '''
    definition = self.instances.get(num_id)
    if definition is None:
        return None
    for ilvl, lvl in definition.levels.iteritems():
        if lvl.para_link == style_id:
            return ilvl
    return None
def get_para_style(self, num_id, lvl):
    ''' Return the paragraph_style of level lvl in numbering instance
    num_id, or None if the instance or the level does not exist. '''
    definition = self.instances.get(num_id)
    # With no matching instance the lookup falls through to the lvl
    # argument itself, which normally has no paragraph_style attribute.
    level = lvl if definition is None else definition.levels.get(lvl)
    return getattr(level, 'paragraph_style', None)
def update_counter(self, counter, levelnum, levels):
    ''' Advance the count for level levelnum and reset dependent levels.

    A level is reset to its start value when its restart setting equals
    levelnum + 1, or when it has no restart setting and is the level
    immediately below levelnum. counter is modified in place. '''
    counter[levelnum] += 1
    trigger = levelnum + 1
    for ilvl, lvl in levels.iteritems():
        restart = lvl.restart
        if restart == trigger or (restart is None and ilvl == trigger):
            counter[ilvl] = lvl.start
def apply_markup(self, items, body, styles, object_map):
# Turn numbered paragraphs into HTML lists, in three passes over the tree.
#
# items: iterable of (html element, num_id, ilvl) triples
# body: root element that contains those html elements
# styles: object providing register() and para_cache
# object_map: maps generated html elements to their source objects
#
# Pass 1: retag every paragraph that belongs to a known numbering instance
# and level as <li> and annotate it with temporary list-* attributes.
for p, num_id, ilvl in items:
d = self.instances.get(num_id, None)
if d is not None:
lvl = d.levels.get(ilvl, None)
if lvl is not None:
counter = self.counters[num_id]
p.tag = 'li'
# The value recorded is the count *before* update_counter() advances it
p.set('value', '%s' % counter[ilvl])
p.set('list-lvl', str(ilvl))
p.set('list-id', num_id)
if lvl.num_template is not None:
val = lvl.format_template(counter, ilvl)
p.set('list-template', val)
self.update_counter(counter, ilvl, d.levels)
# Wrap one run of sibling <li> elements (same list-id and list-lvl) in a
# new <ol>/<ul> inserted at the position of the first item, then empty
# the run so the caller can start collecting the next one.
def commit(current_run):
if not current_run:
return
start = current_run[0]
parent = start.getparent()
idx = parent.index(start)
d = self.instances[start.get('list-id')]
ilvl = int(start.get('list-lvl'))
lvl = d.levels[ilvl]
lvlid = start.get('list-id') + start.get('list-lvl')
wrap = (OL if lvl.is_numbered else UL)('\n\t')
has_template = 'list-template' in start.attrib
# Templated lists are only marked with lvlid here; pass 3 converts them.
# All other lists get a CSS class carrying their list-style-type.
if has_template:
wrap.set('lvlid', lvlid)
else:
wrap.set('class', styles.register({'list-style-type': lvl.fmt}, 'list'))
parent.insert(idx, wrap)
last_val = None
for child in current_run:
wrap.append(child)
child.tail = '\n\t'
if has_template:
# Move the item's own text and children into one span, then put a span
# holding the formatted number in front of it.
# NOTE(review): children are moved while iterating over child; this
# relies on lxml tolerating that -- confirm.
span = SPAN()
span.text = child.text
child.text = None
for gc in child:
span.append(gc)
child.append(span)
span = SPAN(child.get('list-template'))
child.insert(0, span)
# The list-* attributes were only bookkeeping for this method
for attr in ('list-lvl', 'list-id', 'list-template'):
child.attrib.pop(attr, None)
val = int(child.get('value'))
# Drop the explicit value when it simply continues the sequence, or when
# the wrapper is an unordered list
if last_val == val - 1 or wrap.tag == 'ul':
child.attrib.pop('value')
last_val = val
current_run[-1].tail = '\n'
del current_run[:]
# Pass 2: for every element that directly contains <li> items, group
# consecutive <li> siblings with the same (list-id, list-lvl) and commit
# each group. Any non-<li> sibling ends the current group.
parents = set()
for child in body.iterdescendants('li'):
parents.add(child.getparent())
for parent in parents:
current_run = []
for child in parent:
if child.tag == 'li':
if current_run:
last = current_run[-1]
if (last.get('list-id') , last.get('list-lvl')) != (child.get('list-id'), child.get('list-lvl')):
commit(current_run)
current_run.append(child)
else:
commit(current_run)
commit(current_run)
# Pass 3: lists marked with lvlid (templated numbering) become a <div>
# laid out with display:table, each item a table-row whose children are
# table-cells.
for wrap in body.xpath('//ol[@lvlid]'):
wrap.attrib.pop('lvlid')
wrap.tag = 'div'
for i, li in enumerate(wrap.iterchildren('li')):
li.tag = 'div'
li.attrib.pop('value', None)
li.set('style', 'display:table-row')
obj = object_map[li]
bs = styles.para_cache[obj]
# The first item's margin-left is applied to the wrapper; margin-left is
# removed from every item's block style.
if i == 0:
wrap.set('style', 'display:table; margin-left: %s' % (bs.css.get('margin-left', 0)))
bs.css.pop('margin-left', None)
for child in li:
child.set('style', 'display:table-cell')

View File

@ -97,7 +97,8 @@ class Styles(object):
def get(self, key, default=None):
    ''' Look up key in self.id_map, returning default when it is absent. '''
    id_map = self.id_map
    return id_map.get(key, default)
def __call__(self, root):
def __call__(self, root, fonts):
self.fonts = fonts
for s in XPath('//w:style')(root):
s = Style(s)
if s.style_id:
@ -198,8 +199,19 @@ class Styles(object):
if default_para.character_style is not None:
self.para_char_cache[p] = default_para.character_style
is_numbering = direct_formatting.numbering is not inherit
if is_numbering:
num_id, lvl = direct_formatting.numbering
if num_id is not None:
p.set('calibre_num_id', '%s:%s' % (lvl, num_id))
if num_id is not None and lvl is not None:
ps = self.numbering.get_para_style(num_id, lvl)
if ps is not None:
parent_styles.append(ps)
for attr in ans.all_properties:
setattr(ans, attr, self.para_val(parent_styles, direct_formatting, attr))
if not (is_numbering and attr == 'text_indent'): # skip text-indent for lists
setattr(ans, attr, self.para_val(parent_styles, direct_formatting, attr))
return ans
def resolve_run(self, r):
@ -235,6 +247,9 @@ class Styles(object):
for attr in ans.all_properties:
setattr(ans, attr, self.run_val(parent_styles, direct_formatting, attr))
if ans.font_family is not inherit:
ans.font_family = self.fonts.family_for(ans.font_family, ans.b, ans.i)
return ans
def resolve(self, obj):
@ -243,11 +258,70 @@ class Styles(object):
if obj.tag.endswith('}r'):
return self.resolve_run(obj)
def cascade(self, layers):
    ''' Hoist font settings shared by many elements up to their container
    so that they need not be repeated on every element.

    :param layers: mapping of paragraph -> list of its runs

    For each paragraph, the most common run font family and the first
    explicit run font size are moved onto the paragraph style. Then the
    most common paragraph font family and size are moved onto
    body_font_family / body_font_size (defaults 'serif' / '10pt'). Styles
    whose value was hoisted are reset to inherit. '''
    def most_common(values):
        # Most frequent explicit value, or inherit when there is none
        tally = Counter(v for v in values if v is not inherit)
        return tally.most_common(1)[0][0] if tally else inherit

    def reset(style_list, attr, value):
        for style in style_list:
            if getattr(style, attr) == value:
                setattr(style, attr, inherit)

    self.body_font_family = 'serif'
    self.body_font_size = '10pt'

    for para, runs in layers.iteritems():
        run_styles = [self.resolve_run(r) for r in runs]
        para_style = self.resolve_paragraph(para)

        family = most_common([s.font_family for s in run_styles])
        if family is not inherit:
            para_style.font_family = family
            reset(run_styles, 'font_family', family)

        explicit_sizes = [
            s.font_size for s in run_styles if s.font_size is not inherit]
        if explicit_sizes:
            size = explicit_sizes[0]
            para_style.font_size = size
            reset(run_styles, 'font_size', size)

    para_styles = [self.resolve_paragraph(para) for para in layers]

    family = most_common([s.font_family for s in para_styles])
    if family is not inherit:
        self.body_font_family = family
        reset(para_styles, 'font_family', family)

    size = most_common([s.font_size for s in para_styles])
    if size is not inherit:
        reset(para_styles, 'font_size', size)
        self.body_font_size = '%.3gpt' % size
def resolve_numbering(self, numbering):
    ''' Remember numbering on self and fix up the numbering information
    stored on paragraph styles.

    For every style whose paragraph style carries numbering, the level is
    replaced by the level of that numbering instance which links back to
    the style (numbering.get_pstyle). If no level links to the style, the
    style's numbering is reset to inherit. '''
    # When a numPr element appears inside a paragraph style, the lvl info
    # must be discarded and pStyle used instead.
    self.numbering = numbering
    for style in self:
        ps = style.paragraph_style
        if ps is not None and ps.numbering is not inherit:
            num_id = ps.numbering[0]
            lvl = numbering.get_pstyle(num_id, style.style_id)
            ps.numbering = inherit if lvl is None else (num_id, lvl)
def register(self, css, prefix):
h = hash(tuple(css.iteritems()))
h = hash(frozenset(css.iteritems()))
ans, _ = self.classes.get(h, (None, None))
if ans is None:
self.counter[prefix] += 1
@ -266,14 +340,21 @@ class Styles(object):
self.register(css, 'text')
def class_name(self, css):
    ''' Return the class name previously registered for the css mapping,
    or None if it has not been registered.

    The lookup key is the hash of the frozenset of the mapping's items,
    so it does not depend on the order in which the items are iterated. '''
    h = hash(frozenset(css.iteritems()))
    return self.classes.get(h, (None, None))[0]
def generate_css(self):
def generate_css(self, dest_dir, docx):
ef = self.fonts.embed_fonts(dest_dir, docx)
prefix = textwrap.dedent(
'''\
p { margin: 0; padding: 0; text-indent: 1.5em }
''')
body { font-family: %s; font-size: %s }
p { text-indent: 1.5em }
ul, ol, p { margin: 0; padding: 0 }
''') % (self.body_font_family, self.body_font_size)
if ef:
prefix = ef + '\n' + prefix
ans = []
for (cls, css) in sorted(self.classes.itervalues(), key=lambda x:x[0]):

View File

@ -7,15 +7,17 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import sys, os, re
from collections import OrderedDict
from lxml import html
from lxml.html.builder import (
HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR)
from calibre.ebooks.docx.container import DOCX, fromstring
from calibre.ebooks.docx.names import XPath, is_tag, barename, XML, STYLES, NUMBERING
from calibre.ebooks.docx.names import XPath, is_tag, XML, STYLES, NUMBERING, FONTS
from calibre.ebooks.docx.styles import Styles, inherit
from calibre.ebooks.docx.numbering import Numbering
from calibre.ebooks.docx.fonts import Fonts
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
class Text:
@ -36,7 +38,7 @@ class Convert(object):
self.mi = self.docx.metadata
self.body = BODY()
self.styles = Styles()
self.object_map = {}
self.object_map = OrderedDict()
self.html = HTML(
HEAD(
META(charset='utf-8'),
@ -62,16 +64,27 @@ class Convert(object):
doc = self.docx.document
relationships_by_id, relationships_by_type = self.docx.document_relationships
self.read_styles(relationships_by_type)
for top_level in XPath('/w:document/w:body/*')(doc):
if is_tag(top_level, 'w:p'):
p = self.convert_p(top_level)
self.body.append(p)
elif is_tag(top_level, 'w:tbl'):
pass # TODO: tables
elif is_tag(top_level, 'w:sectPr'):
pass # TODO: Last section properties
else:
self.log.debug('Unknown top-level tag: %s, ignoring' % barename(top_level.tag))
self.layers = OrderedDict()
for wp in XPath('//w:p')(doc):
p = self.convert_p(wp)
self.body.append(p)
# TODO: tables <w:tbl> child of <w:body> (nested tables?)
# TODO: Last section properties <w:sectPr> child of <w:body>
self.styles.cascade(self.layers)
numbered = []
for html_obj, obj in self.object_map.iteritems():
raw = obj.get('calibre_num_id', None)
if raw is not None:
lvl, num_id = raw.partition(':')[0::2]
try:
lvl = int(lvl)
except (TypeError, ValueError):
lvl = 0
numbered.append((html_obj, num_id, lvl))
self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map)
if len(self.body) > 0:
self.body.text = '\n\t'
for child in self.body:
@ -102,7 +115,18 @@ class Convert(object):
nname = get_name(NUMBERING, 'numbering.xml')
sname = get_name(STYLES, 'styles.xml')
numbering = Numbering()
fname = get_name(FONTS, 'fontTable.xml')
numbering = self.numbering = Numbering()
fonts = self.fonts = Fonts()
if fname is not None:
embed_relationships = self.docx.get_relationships(fname)[0]
try:
raw = self.docx.read(fname)
except KeyError:
self.log.warn('Fonts table %s does not exist' % fname)
else:
fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir)
if sname is not None:
try:
@ -110,7 +134,7 @@ class Convert(object):
except KeyError:
self.log.warn('Styles %s do not exist' % sname)
else:
self.styles(fromstring(raw))
self.styles(fromstring(raw), fonts)
if nname is not None:
try:
@ -126,17 +150,20 @@ class Convert(object):
raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
f.write(raw)
css = self.styles.generate_css()
css = self.styles.generate_css(self.dest_dir, self.docx)
if css:
with open(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
f.write(css.encode('utf-8'))
def convert_p(self, p):
dest = P()
self.object_map[dest] = p
style = self.styles.resolve_paragraph(p)
self.layers[p] = []
for run in XPath('descendant::w:r')(p):
span = self.convert_run(run)
dest.append(span)
self.layers[p].append(run)
m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
if m is not None:
@ -162,18 +189,14 @@ class Convert(object):
spans = []
bs = {}
for span, style in border_run:
c = style.css
style.get_border_css(bs)
style.clear_border_css()
spans.append(span)
for x in ('width', 'color', 'style'):
val = c.pop('border-%s' % x, None)
if val is not None:
bs['border-%s' % x] = val
if bs:
cls = self.styles.register(bs, 'text_border')
wrapper = self.wrap_elems(spans, SPAN())
wrapper.set('class', cls)
self.object_map[dest] = p
return dest
def wrap_elems(self, elems, wrapper):
@ -188,7 +211,7 @@ class Convert(object):
def convert_run(self, run):
ans = SPAN()
ans.run = run
self.object_map[ans] = run
text = Text(ans, 'text', [])
for child in run:
@ -224,7 +247,6 @@ class Convert(object):
ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup'
if style.lang is not inherit:
ans.lang = style.lang
self.object_map[ans] = run
return ans
if __name__ == '__main__':

View File

@ -163,7 +163,8 @@ class MOBIFile(object):
ext = 'dat'
prefix = 'binary'
suffix = ''
if sig in {b'HUFF', b'CDIC', b'INDX'}: continue
if sig in {b'HUFF', b'CDIC', b'INDX'}:
continue
# TODO: Ignore CNCX records as well
if sig == b'FONT':
font = read_font_record(rec.raw)
@ -196,7 +197,6 @@ class MOBIFile(object):
vals = list(index)[:-1] + [None, None, None, None]
entry_map.append(Entry(*(vals[:12])))
indexing_data = collect_indexing_data(entry_map, list(map(len,
self.text_records)))
self.indexing_data = [DOC + '\n' +textwrap.dedent('''\

View File

@ -16,7 +16,8 @@ from calibre.ebooks.oeb.transforms.flatcss import KeyMapper
from calibre.utils.magick.draw import identify_data
MBP_NS = 'http://mobipocket.com/ns/mbp'


def MBP(name):
    ''' Return name qualified with the Mobipocket namespace, in
    ``{namespace}name`` form. '''
    return '{%s}%s' % (MBP_NS, name)
MOBI_NSMAP = {None: XHTML_NS, 'mbp': MBP_NS}
@ -413,7 +414,7 @@ class MobiMLizer(object):
# img sizes in units other than px
# See #7520 for test case
try:
pixs = int(round(float(value) / \
pixs = int(round(float(value) /
(72./self.profile.dpi)))
except:
continue
@ -488,8 +489,6 @@ class MobiMLizer(object):
if elem.text:
if istate.preserve:
text = elem.text
elif len(elem) > 0 and isspace(elem.text):
text = None
else:
text = COLLAPSE.sub(' ', elem.text)
valign = style['vertical-align']

View File

@ -181,9 +181,9 @@ class BookHeader(object):
self.codec = 'cp1252' if not user_encoding else user_encoding
log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
self.codec))
# Some KF8 files have header length == 256 (generated by kindlegen
# 2.7?). See https://bugs.launchpad.net/bugs/1067310
max_header_length = 0x100
# Some KF8 files have header length == 264 (generated by kindlegen
# 2.9?). See https://bugs.launchpad.net/bugs/1179144
max_header_length = 0x108
if (ident == 'TEXTREAD' or self.length < 0xE4 or
self.length > max_header_length or

View File

@ -112,7 +112,7 @@ def update_flow_links(mobi8_reader, resource_map, log):
url_css_index_pattern = re.compile(r'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)
for flow in mr.flows:
if flow is None: # 0th flow is None
if flow is None: # 0th flow is None
flows.append(flow)
continue
@ -330,7 +330,7 @@ def expand_mobi8_markup(mobi8_reader, resource_map, log):
mobi8_reader.flows = flows
# write out the parts and file flows
os.mkdir('text') # directory containing all parts
os.mkdir('text') # directory containing all parts
spine = []
for i, part in enumerate(parts):
pi = mobi8_reader.partinfo[i]

View File

@ -871,6 +871,7 @@ class Manifest(object):
orig_data = data
fname = urlunquote(self.href)
self.oeb.log.debug('Parsing', fname, '...')
self.oeb.html_preprocessor.current_href = self.href
try:
data = parse_html(data, log=self.oeb.log,
decoder=self.oeb.decode,
@ -1312,9 +1313,9 @@ class Guide(object):
('notes', __('Notes')),
('preface', __('Preface')),
('text', __('Main Text'))]
TYPES = set(t for t, _ in _TYPES_TITLES)
TYPES = set(t for t, _ in _TYPES_TITLES) # noqa
TITLES = dict(_TYPES_TITLES)
ORDER = dict((t, i) for i, (t, _) in enumerate(_TYPES_TITLES))
ORDER = dict((t, i) for i, (t, _) in enumerate(_TYPES_TITLES)) # noqa
def __init__(self, oeb, type, title, href):
self.oeb = oeb

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, re
import sys, os, re
from calibre.customize.ui import available_input_formats
@ -26,17 +26,18 @@ def EbookIterator(*args, **kwargs):
from calibre.ebooks.oeb.iterator.book import EbookIterator
return EbookIterator(*args, **kwargs)
def get_preprocess_html(path_to_ebook, output):
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
iterator = EbookIterator(path_to_ebook)
iterator.__enter__(only_input_plugin=True, run_char_count=False,
read_anchor_map=False)
preprocessor = HTMLPreProcessor(None, False)
with open(output, 'wb') as out:
for path in iterator.spine:
with open(path, 'rb') as f:
html = f.read().decode('utf-8', 'replace')
html = preprocessor(html, get_preprocess_html=True)
def get_preprocess_html(path_to_ebook, output=None):
    ''' Run a conversion of path_to_ebook in regex-wizard mode and write
    the HTML captured for each spine item, in spine order, separated by a
    line of dashes.

    :param path_to_ebook: path of the book to process
    :param output: path of the file to write (opened in binary mode); when
                   None the result is written to sys.stdout '''
    from calibre.ebooks.conversion.plumber import set_regex_wizard_callback, Plumber
    from calibre.utils.logging import DevNull
    from calibre.ptempfile import TemporaryDirectory

    # The callback stores what it receives in raw; the lookups below show
    # it is keyed by item href with the HTML text as value.
    raw = {}
    set_regex_wizard_callback(raw.__setitem__)
    with TemporaryDirectory('_regex_wiz') as tdir:
        pl = Plumber(path_to_ebook, os.path.join(tdir, 'a.epub'), DevNull(), for_regex_wizard=True)
        pl.run()
        items = [raw[item.href] for item in pl.oeb.spine if item.href in raw]

    # Do not use sys.stdout as a context manager: leaving the with block
    # would close the process's standard output. Only close files that
    # were opened here.
    out = sys.stdout if output is None else open(output, 'wb')
    try:
        for html in items:
            out.write(html.encode('utf-8'))
            out.write(b'\n\n' + b'-'*80 + b'\n\n')
    finally:
        if output is not None:
            out.close()

View File

@ -25,7 +25,7 @@ from calibre.ebooks.oeb.transforms.cover import CoverManager
from calibre.ebooks.oeb.iterator.spine import (SpineItem, create_indexing_data)
from calibre.ebooks.oeb.iterator.bookmarks import BookmarksMixin
TITLEPAGE = CoverManager.SVG_TEMPLATE.decode('utf-8').replace(\
TITLEPAGE = CoverManager.SVG_TEMPLATE.decode('utf-8').replace(
'__ar__', 'none').replace('__viewbox__', '0 0 600 800'
).replace('__width__', '600').replace('__height__', '800')

View File

@ -44,8 +44,10 @@ META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')
def merge_multiple_html_heads_and_bodies(root, log=None):
heads, bodies = xpath(root, '//h:head'), xpath(root, '//h:body')
if not (len(heads) > 1 or len(bodies) > 1): return root
for child in root: root.remove(child)
if not (len(heads) > 1 or len(bodies) > 1):
return root
for child in root:
root.remove(child)
head = root.makeelement(XHTML('head'))
body = root.makeelement(XHTML('body'))
for h in heads:
@ -88,7 +90,7 @@ def html5_parse(data, max_nesting_depth=100):
# Check that the asinine HTML 5 algorithm did not result in a tree with
# insane nesting depths
for x in data.iterdescendants():
if isinstance(x.tag, basestring) and len(x) is 0: # Leaf node
if isinstance(x.tag, basestring) and len(x) is 0: # Leaf node
depth = node_depth(x)
if depth > max_nesting_depth:
raise ValueError('html5lib resulted in a tree with nesting'
@ -228,7 +230,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
if idx > -1:
pre = data[:idx]
data = data[idx:]
if '<!DOCTYPE' in pre: # Handle user defined entities
if '<!DOCTYPE' in pre: # Handle user defined entities
user_entities = {}
for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
val = match.group(2)
@ -368,8 +370,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
meta.getparent().remove(meta)
meta = etree.SubElement(head, XHTML('meta'),
attrib={'http-equiv': 'Content-Type'})
meta.set('content', 'text/html; charset=utf-8') # Ensure content is second
# attribute
meta.set('content', 'text/html; charset=utf-8') # Ensure content is second attribute
# Ensure has a <body/>
if not xpath(data, '/h:html/h:body'):

View File

@ -45,11 +45,15 @@ class Links(object):
href, page, rect = link
p, frag = href.partition('#')[0::2]
try:
link = ((path, p, frag or None), self.pdf.get_pageref(page).obj, Array(rect))
pref = self.pdf.get_pageref(page).obj
except IndexError:
self.log.warn('Unable to find page for link: %r, ignoring it' % link)
continue
self.links.append(link)
try:
pref = self.pdf.get_pageref(page-1).obj
except IndexError:
self.pdf.debug('Unable to find page for link: %r, ignoring it' % link)
continue
self.pdf.debug('The link %s points to non-existent page, moving it one page back' % href)
self.links.append(((path, p, frag or None), pref, Array(rect)))
def add_links(self):
for link in self.links:

View File

@ -873,7 +873,6 @@ class Application(QApplication):
v = pcache[v]
icon_map[type('')(getattr(style, 'SP_'+k))] = v
style.setProperty(u'calibre_icon_map', icon_map)
style.setProperty(u'calibre_item_view_focus', True)
self.__icon_map_memory_ = icon_map
def setup_styles(self, force_calibre_style):

View File

@ -122,7 +122,8 @@ def device_name_for_plugboards(device_class):
class DeviceManager(Thread): # {{{
def __init__(self, connected_slot, job_manager, open_feedback_slot,
open_feedback_msg, allow_connect_slot, sleep_time=2):
open_feedback_msg, allow_connect_slot,
after_callback_feedback_slot, sleep_time=2):
'''
:sleep_time: Time to sleep between device probes in secs
'''
@ -150,6 +151,7 @@ class DeviceManager(Thread): # {{{
self.ejected_devices = set([])
self.mount_connection_requests = Queue.Queue(0)
self.open_feedback_slot = open_feedback_slot
self.after_callback_feedback_slot = after_callback_feedback_slot
self.open_feedback_msg = open_feedback_msg
self._device_information = None
self.current_library_uuid = None
@ -392,6 +394,10 @@ class DeviceManager(Thread): # {{{
self.device.set_progress_reporter(job.report_progress)
self.current_job.run()
self.current_job = None
feedback = getattr(self.device, 'user_feedback_after_callback', None)
if feedback is not None:
self.device.user_feedback_after_callback = None
self.after_callback_feedback_slot(feedback)
else:
break
if do_sleep:
@ -850,7 +856,7 @@ class DeviceMixin(object): # {{{
self.device_manager = DeviceManager(FunctionDispatcher(self.device_detected),
self.job_manager, Dispatcher(self.status_bar.show_message),
Dispatcher(self.show_open_feedback),
FunctionDispatcher(self.allow_connect))
FunctionDispatcher(self.allow_connect), Dispatcher(self.after_callback_feedback))
self.device_manager.start()
self.device_manager.devices_initialized.wait()
if tweaks['auto_connect_to_folder']:
@ -862,6 +868,10 @@ class DeviceMixin(object): # {{{
name, show_copy_button=False,
override_icon=QIcon(icon))
def after_callback_feedback(self, feedback):
    ''' Show an information dialog for feedback left by the device driver
    after a job has run.

    :param feedback: mapping with the keys 'title', 'msg' and 'det_msg' '''
    # feedback is indexed by key, so it must not be tuple-unpacked first:
    # unpacking only yielded three unused names and raised ValueError for
    # any mapping that did not have exactly three keys.
    info_dialog(self, feedback['title'], feedback['msg'],
                det_msg=feedback['det_msg']).show()
def debug_detection(self, done):
self.debug_detection_callback = weakref.ref(done)
self.device_manager.debug_detection(FunctionDispatcher(self.debug_detection_done))
@ -1116,7 +1126,7 @@ class DeviceMixin(object): # {{{
return
dm = self.iactions['Remove Books'].delete_memory
if dm.has_key(job):
if job in dm:
paths, model = dm.pop(job)
self.device_manager.remove_books_from_metadata(paths,
self.booklists())
@ -1141,7 +1151,7 @@ class DeviceMixin(object): # {{{
def dispatch_sync_event(self, dest, delete, specific):
rows = self.library_view.selectionModel().selectedRows()
if not rows or len(rows) == 0:
error_dialog(self, _('No books'), _('No books')+' '+\
error_dialog(self, _('No books'), _('No books')+' '+
_('selected to send')).exec_()
return
@ -1160,7 +1170,7 @@ class DeviceMixin(object): # {{{
if fmts:
for f in fmts.split(','):
f = f.lower()
if format_count.has_key(f):
if f in format_count:
format_count[f] += 1
else:
format_count[f] = 1

View File

@ -139,6 +139,7 @@ class BooksView(QTableView): # {{{
def __init__(self, parent, modelcls=BooksModel, use_edit_metadata_dialog=True):
QTableView.__init__(self, parent)
self.setProperty('highlight_current_item', 150)
self.row_sizing_done = False
if not tweaks['horizontal_scrolling_per_column']:

View File

@ -13,13 +13,82 @@ from threading import Thread
from calibre import walk, prints, as_unicode
from calibre.constants import (config_dir, iswindows, isosx, plugins, DEBUG,
isworker)
isworker, filesystem_encoding)
from calibre.utils.fonts.metadata import FontMetadata, UnsupportedFont
from calibre.utils.icu import sort_key
class NoFonts(ValueError):
    ''' ValueError subclass used by the font scanner; its raise sites are
    outside this view. '''
def default_font_dirs():
    ''' Return the built-in list of font directories: three system-wide
    locations followed by two per-user ones under the home directory. '''
    system_dirs = [
        '/opt/share/fonts',
        '/usr/share/fonts',
        '/usr/local/share/fonts',
    ]
    user_dirs = [
        os.path.expanduser(d) for d in ('~/.local/share/fonts', '~/.fonts')]
    return system_dirs + user_dirs
def fc_list():
    ''' Return the font directories configured in fontconfig, with
    directories nested inside an earlier entry removed.

    Falls back to default_font_dirs() when the fontconfig library or one
    of the required functions is unavailable, when a directory name cannot
    be decoded, or when fontconfig reports fewer than three directories.
    Always returns a list. '''
    import ctypes
    from ctypes.util import find_library

    lib = find_library('fontconfig')
    if lib is None:
        return default_font_dirs()
    try:
        lib = ctypes.CDLL(lib)
    except Exception:
        # Best effort: any failure to load the library means fall back
        return default_font_dirs()

    def bind(name, restype):
        # Every function used here takes a single pointer argument
        return ctypes.CFUNCTYPE(restype, ctypes.c_void_p)((name, lib))

    try:
        get_font_dirs = bind('FcConfigGetFontDirs', ctypes.c_void_p)
        next_dir = bind('FcStrListNext', ctypes.c_char_p)
        end = bind('FcStrListDone', None)
    except AttributeError:
        return default_font_dirs()

    # A NULL config pointer is passed to FcConfigGetFontDirs
    str_list = get_font_dirs(ctypes.c_void_p())
    if not str_list:
        return default_font_dirs()

    ans = []
    try:
        while True:
            d = next_dir(str_list)
            if not d:
                break
            try:
                ans.append(d.decode(filesystem_encoding))
            except ValueError:
                return default_font_dirs()
    finally:
        # Release the string list on every path, including the early return
        end(str_list)

    if len(ans) < 3:
        return default_font_dirs()

    parents = []
    for f in ans:
        # Compare whole path components, so that e.g. /usr/share/fonts-x is
        # not mistaken for a subdirectory of /usr/share/fonts
        nested = any(
            f == p or f.startswith(p.rstrip(os.sep) + os.sep)
            for p in parents)
        if not nested:
            parents.append(f)
    return parents
def font_dirs():
if iswindows:
winutil, err = plugins['winutil']
@ -35,12 +104,7 @@ def font_dirs():
os.path.expanduser('~/.fonts'),
os.path.expanduser('~/Library/Fonts'),
]
return [
'/opt/share/fonts',
'/usr/share/fonts',
'/usr/local/share/fonts',
os.path.expanduser('~/.fonts')
]
return fc_list()
class Scanner(Thread):
@ -133,7 +197,8 @@ class Scanner(Thread):
for family in self.find_font_families():
faces = filter(filter_faces, self.fonts_for_family(family))
if not faces: continue
if not faces:
continue
generic_family = panose_to_css_generic_family(faces[0]['panose'])
if generic_family in allowed_families or generic_family == preferred_families[0]:
return (family, faces)
@ -233,7 +298,8 @@ class Scanner(Thread):
def build_families(self):
families = defaultdict(list)
for f in self.cached_fonts.itervalues():
if not f: continue
if not f:
continue
lf = icu_lower(f['font-family'] or '')
if lf:
families[lf].append(f)

View File

@ -661,6 +661,17 @@ icu_set_default_encoding(PyObject *self, PyObject *args) {
}
// }}}
// set_filesystem_encoding {{{
/*
 * set_filesystem_encoding(encoding)
 *
 * Point Py_FileSystemDefaultEncoding at a private copy of the given
 * encoding name. The copy is never freed, because the interpreter keeps
 * referring to it. Returns None, or raises MemoryError if the name could
 * not be copied.
 */
static PyObject *
icu_set_filesystem_encoding(PyObject *self, PyObject *args) {
    char *encoding, *copy;
    if (!PyArg_ParseTuple(args, "s:setfilesystemencoding", &encoding))
        return NULL;
    copy = strdup(encoding);
    /* Do not overwrite the current encoding with NULL and report success */
    if (copy == NULL)
        return PyErr_NoMemory();
    Py_FileSystemDefaultEncoding = copy;
    Py_RETURN_NONE;
}
// }}}
// get_available_transliterators {{{
static PyObject *
icu_get_available_transliterators(PyObject *self, PyObject *args) {
@ -707,6 +718,10 @@ static PyMethodDef icu_methods[] = {
"set_default_encoding(encoding) -> Set the default encoding for the python unicode implementation."
},
{"set_filesystem_encoding", icu_set_filesystem_encoding, METH_VARARGS,
"set_filesystem_encoding(encoding) -> Set the filesystem encoding for python."
},
{"get_available_transliterators", icu_get_available_transliterators, METH_VARARGS,
"get_available_transliterators() -> Return list of available transliterators. This list is rather limited on OS X."
},

View File

@ -163,11 +163,22 @@ load_collator()
_icu_not_ok = _icu is None or _collator is None
try:
if sys.getdefaultencoding().lower() == 'ascii':
senc = sys.getdefaultencoding()
if not senc or senc.lower() == 'ascii':
_icu.set_default_encoding('utf-8')
del senc
except:
pass
try:
fenc = sys.getfilesystemencoding()
if not fenc or fenc.lower() == 'ascii':
_icu.set_filesystem_encoding('utf-8')
del fenc
except:
pass
# }}}
################# The string functions ########################################
@ -247,7 +258,7 @@ def collation_order(a):
################################################################################
def test(): # {{{
def test(): # {{{
from calibre import prints
# Data {{{
german = '''

View File

@ -3698,7 +3698,7 @@ bool Style::event(QEvent *event) {
}
return true;
} else if (e->propertyName() == QString("calibre_item_view_focus")) {
calibre_item_view_focus = property("calibre_item_view_focus").toBool();
calibre_item_view_focus = property("calibre_item_view_focus").toInt();
return true;
}
}
@ -4803,10 +4803,11 @@ void Style::drawPrimitive(PrimitiveElement element, const QStyleOption *option,
painter->setBrush(QBrush(patternCol, Qt::Dense4Pattern));
painter->setBrushOrigin(r.topLeft());
painter->setPen(Qt::NoPen);
painter->drawRect(r.left(), r.top(), r.width(), 1); // Top
painter->drawRect(r.left(), r.bottom(), r.width(), 1); // Bottom
painter->drawRect(r.left(), r.top(), 1, r.height()); // Left
painter->drawRect(r.right(), r.top(), 1, r.height()); // Right
int fwidth = (calibre_item_view_focus > 1) ? 2 : 1;
painter->drawRect(r.left(), r.top(), r.width(), fwidth); // Top
painter->drawRect(r.left(), r.bottom(), r.width(), fwidth); // Bottom
painter->drawRect(r.left(), r.top(), fwidth, r.height()); // Left
painter->drawRect(r.right(), r.top(), fwidth, r.height()); // Right
painter->restore();
}
else
@ -5249,6 +5250,14 @@ void Style::drawPrimitive(PrimitiveElement element, const QStyleOption *option,
QColor color(hasCustomBackground && hasSolidBackground
? v4Opt->backgroundBrush.color()
: palette.color(cg, QPalette::Highlight));
if (state & State_HasFocus && widget && widget->property("highlight_current_item").toBool()) {
// Added by Kovid to highlight the current cell in the book list
if (color.lightness() > 128)
color = color.darker(widget->property("highlight_current_item").toInt());
else
color = color.lighter();
}
bool square((opts.square&SQUARE_LISTVIEW_SELECTION) &&
(/*(!widget && r.height()<=40 && r.width()>=48) || */
(widget && !widget->inherits("KFilePlacesView") &&

View File

@ -355,7 +355,7 @@ class Style : public QCommonStyle
mutable QList<int> itsMdiButtons[2]; // 0=left, 1=right
mutable int itsTitlebarHeight;
QHash<int,QString> calibre_icon_map;
bool calibre_item_view_focus;
int calibre_item_view_focus;
bool is_kde_session;
// Required for Q3Header hover...