Pull from driver-dev

This commit is contained in:
Kovid Goyal 2009-05-21 07:07:01 -07:00
commit 5355ad2c32
6 changed files with 287 additions and 107 deletions

View File

@ -16,7 +16,7 @@ from calibre.ebooks.pdb.formatwriter import FormatWriter
from calibre.ebooks.oeb.base import OEB_IMAGES
from calibre.ebooks.pdb.header import PdbHeaderBuilder
from calibre.ebooks.pdb.ereader import image_name
from calibre.ebooks.pml.pmlconverter import html_to_pml
from calibre.ebooks.pml.pmlml import PMLMLizer
IDENTITY = 'PNRdPPrs'
@ -31,7 +31,7 @@ class Writer(FormatWriter):
self.log = log
def write_content(self, oeb_book, out_stream, metadata=None):
text = self._text(oeb_book.spine)
text = self._text(oeb_book)
images = self._images(oeb_book.manifest)
metadata = [self._metadata(metadata)]
@ -41,16 +41,15 @@ class Writer(FormatWriter):
lengths = [len(i) for i in sections]
pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, '')
pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, metadata[0].partition('\x00')[0])
pdbHeaderBuilder.build_header(lengths, out_stream)
for item in sections:
out_stream.write(item)
def _text(self, pages):
pml = ''
for page in pages:
pml += html_to_pml(unicode(page)).encode('cp1252')
def _text(self, oeb_book):
pmlmlizer = PMLMLizer(ignore_tables=self.opts.linearize_tables)
pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
pml_pages = []
for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1):

View File

@ -12,7 +12,7 @@ from calibre.customize.conversion import OutputFormatPlugin
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.zipfile import ZipFile
from calibre.ebooks.oeb.base import OEB_IMAGES
from calibre.ebooks.pml.pmlconverter import html_to_pml
from calibre.ebooks.pml.pmlml import PMLMLizer
class PMLOutput(OutputFormatPlugin):
@ -22,22 +22,16 @@ class PMLOutput(OutputFormatPlugin):
def convert(self, oeb_book, output_path, input_plugin, opts, log):
with TemporaryDirectory('_pmlz_output') as tdir:
self.process_spine(oeb_book.spine, tdir)
pmlmlizer = PMLMLizer(ignore_tables=opts.linearize_tables)
content = pmlmlizer.extract_content(oeb_book, opts)
with open(os.path.join(tdir, 'index.pml'), 'wb') as out:
out.write(content.encode('utf-8'))
self.write_images(oeb_book.manifest, tdir)
pmlz = ZipFile(output_path, 'w')
pmlz.add_dir(tdir)
def process_spine(self, spine, out_dir):
for item in spine:
html = html_to_pml(unicode(item)).encode('utf-8')
name = os.path.splitext(os.path.basename(item.href))[0] + '.pml'
path = os.path.join(out_dir, name)
with open(path, 'wb') as out:
out.write(html)
def write_images(self, manifest, out_dir):
for item in manifest:
if item.media_type in OEB_IMAGES:

View File

@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
import re
from calibre import entity_to_unicode
from calibre.ebooks.pdb.ereader import image_name
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
@ -67,75 +66,6 @@ PML_HTML_RULES = [
(re.compile(r'\\\\'), lambda match: '\\'),
]
HTML_PML_RULES = [
(re.compile(r'\\'), lambda match: '\\\\'),
(re.compile('(?<=[^\n])[ ]*<p.*?>'), lambda match: '\n<p>'),
(re.compile('</p>(?=^\n|^\r\n)'), lambda match: '\n'),
# Clean up HTML
(re.compile('@page.*?}'), lambda match: ''),
(re.compile('<script.*?>.*?</script>', re.DOTALL), lambda match: ''),
(re.compile('<style.*?>.*?</style>', re.DOTALL), lambda match: ''),
# Reflow paragraphs
(re.compile('<p.*?>(?P<text>.*?)</p>', re.DOTALL), lambda match: match.group('text').replace('\r\n', ' ').replace('\n', ' ')),
# HTML to PML
(re.compile('<a.*?href="#sidebar-(?P<target>.+?).*?">(?P<text>.+?)</a>'), lambda match: '\\Sd="%s"%s\\Sd' % (match.group('target'), match.group('text'))),
(re.compile('<a.*?href="#footnote-(?P<target>.+?).*?">(?P<text>.+?)</a>'), lambda match: '\\Fn="%s"%s\\Fn' % (match.group('target'), match.group('text'))),
(re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: '\\\\Q="%s"' % match.group('target')),
(re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))),
(re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name')).strip('\x00')),
(re.compile('&(?P<num>#\d+);'), lambda match: entity_to_unicode(match)),
(re.compile('&(?P<num>.+);'), lambda match: entity_to_unicode(match)),
(re.compile('<small .*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
(re.compile('<small>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
(re.compile('<sub .*?>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
(re.compile('<sub>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
(re.compile('<sup .*?>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
(re.compile('<sup>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
(re.compile('<b .*?>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
(re.compile('<b>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
(re.compile('<strong .*?>(?P<text>.+?)</strong>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
(re.compile('<strong>(?P<text>.+?)</strong>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
(re.compile('<big .*?>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
(re.compile('<big>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
(re.compile('<hr.*?width="(?P<val>\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')),
(re.compile('<div.*?style.*?margin-left: (?P<val>\d+)%*;.*?>(?P<text>.+?)</div>', re.MULTILINE), lambda match: '\\T="%s%%"%s$' % (match.group('val'), match.group('text'))),
(re.compile('<div.*?style.*?margin-left: \d{1,3}%;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')),
(re.compile('<!--(?P<text>.+?)-->', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')),
(re.compile('<del .*?>(?P<text>.+?)</del>', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')),
(re.compile('<del>(?P<text>.+?)</del>', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')),
(re.compile('<div.*?style.*?text-decoration: underline;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\u%s\\u' % match.group('text')),
(re.compile('<i .*?>(?P<text>.+?)</i>', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')),
(re.compile('<i>(?P<text>.+?)</i>', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')),
(re.compile('<div.*?style.*?text-align: right;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\r%s\\r' % match.group('text')),
(re.compile('<div.*?style.*?text-align: center;.*?".*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')),
(re.compile('<h(?P<val>[0-4]).*?>(?P<text>.+?)</h[0-4]>', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
(re.compile('<h1.*?>(?P<text>.+?)</h1>', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')),
(re.compile('<br .*?>'), lambda match: '\n'),
(re.compile('<br/*>'), lambda match: '\n'),
# Remove remaining HTML tags
(re.compile('<.*?>'), lambda match: ''),
# Remove redundant page break markers
(re.compile(r'(\\p){2,}'), lambda match: r'\p'),
# Remove whitespace on empty lines
(re.compile('^[\t\r ]$', re.MULTILINE), lambda match: ''),
# Remove excess whitespace in lines
(re.compile('(?<=.)[ ]{2,}(?=.)'), lambda match: ' '),
# Remove excess newlines at the beginning and end
(re.compile('^(\r\n){1,}'), lambda match: ''),
(re.compile('^\n{1,}'), lambda match: ''),
(re.compile('(\r\n){3,}$'), lambda match: ''),
(re.compile('\n{3,}$'), lambda match: ''),
]
def pml_to_html(pml):
html = pml
for rule in PML_HTML_RULES:
@ -151,15 +81,3 @@ def footnote_sidebar_to_html(id, pml):
html = '<div id="sidebar-%s"><dt>%s</dt></div><dd>%s</dd>' % (id, id, pml_to_html(pml))
return html
def html_to_pml(html):
pml = ''
for dom_tree in BeautifulSoup(html).findAll('body'):
body = unicode(dom_tree.prettify())
for rule in HTML_PML_RULES:
body = rule[0].sub(rule[1], body)
pml += body
return pml

View File

@ -0,0 +1,205 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
'''
Transform OEB content into PML markup
'''
import os, re
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.pdb.ereader import image_name
from calibre import entity_to_unicode
# Maps an XHTML tag name to the PML tag that wraps its content. The PML tag
# is written as ``\<tag>`` both to open and to close the span.
TAG_MAP = {
    'b'       : 'B',
    'strong'  : 'B',
    # PML italic is lower-case ``\i``.
    'i'       : 'i',
    'small'   : 'k',
    'sub'     : 'Sb',
    'sup'     : 'Sp',
    'big'     : 'l',
    'del'     : 'o',
    # h1 starts a chapter (``\x``); h2-h6 map onto the ``\X0``..``\X4``
    # sub-chapter levels, all of which use an upper-case X.
    'h1'      : 'x',
    'h2'      : 'X0',
    'h3'      : 'X1',
    'h4'      : 'X2',
    'h5'      : 'X3',
    'h6'      : 'X4',
    '!--'     : 'v',
}

# (CSS property, {property value: PML tag}) pairs. A list rather than a dict
# so the properties are always examined in the same order.
STYLES = [
    ('font-weight', {'bold'   : 'B', 'bolder' : 'B'}),
    ('font-style', {'italic' : 'i'}),
    ('text-decoration', {'underline' : 'u'}),
    ('text-align', {'right' : 'r', 'center' : 'c'}),
]
class PMLMLizer(object):
    '''
    Convert the spine of an OEB book into a single string of PML markup.

    Call :meth:`extract_content`; the remaining methods are helpers that
    walk the XHTML tree of every spine item and tidy up the result.
    '''

    def __init__(self, ignore_tables=False):
        self.ignore_tables = ignore_tables

    def extract_content(self, oeb_book, opts):
        '''
        Return the PML markup for ``oeb_book``.

        :param oeb_book: Book whose ``spine`` items are converted. Its
                         ``logger`` is used for progress output.
        :param opts: Conversion options; ``opts.output_profile`` is handed
                     to the stylizer.
        :return: The PML text as returned by :meth:`pmlmlize_spine`.
        '''
        oeb_book.logger.info('Converting XHTML to PML markup...')
        self.oeb_book = oeb_book
        self.opts = opts
        return self.pmlmlize_spine()

    def pmlmlize_spine(self):
        '''
        Convert every spine item in order and return the cleaned up PML.

        Each item is preceded by a ``\\Q`` anchor named after its file so
        that links between files can be resolved.
        '''
        output = u''
        for item in self.oeb_book.spine:
            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
            output += self.add_page_anchor(item.href)
            body = item.data.find(XHTML('body'))
            # A document without a <body> has no text to contribute; passing
            # None on to dump_text would fail on ``elem.tag``.
            if body is not None:
                output += self.dump_text(body, stylizer)
        return self.clean_text(output)

    def add_page_anchor(self, href):
        '''
        Return a ``\\Q`` anchor named after the file name (without directory
        or extension) of ``href``.
        '''
        href = os.path.splitext(os.path.basename(href))[0]
        return '\\Q="%s"' % href

    def clean_text(self, text):
        '''
        Normalise whitespace in ``text``, drop anchors nothing links to and
        resolve HTML entities. Returns the cleaned text.
        '''
        # Remove excess spaces at beginning and end of lines
        text = re.sub('(?m)^[ ]+', '', text)
        text = re.sub('(?m)[ ]+$', '', text)

        # Turn every run of line separators into exactly one blank line. The
        # separator is grouped so that the quantifier applies to all of it;
        # with a two character separator such as '\r\n' an ungrouped
        # quantifier would only repeat the final character.
        linesep = re.escape(os.linesep)
        text = re.sub('(?:%s)+' % linesep, os.linesep * 2, text)
        text = re.sub('[ ]{2,}', ' ', text)

        # Remove excessive \p tags
        # NOTE(review): this deletes both tags of an adjacent pair rather
        # than keeping one -- confirm that is the intended result.
        text = re.sub(r'\\p\s*\\p', '', text)

        # Remove anchors that do not have links
        anchors = set(re.findall(r'(?<=\\Q=").+?(?=")', text))
        links = set(re.findall(r'(?<=\\q="#).+?(?=")', text))
        for unused in anchors.difference(links):
            text = text.replace('\\Q="%s"' % unused, '')

        # Resolve entities.
        # NOTE(review): assumes entity_to_unicode expects the regex match
        # object of the entity (name in the first group) -- confirm.
        text = re.sub(r'&(?P<num>.+?);',
                lambda match: entity_to_unicode(match), text)

        return text

    def dump_text(self, elem, stylizer, tag_stack=None):
        '''
        Recursively convert ``elem`` and its children to PML.

        :param elem: Element to convert. Anything that is not an XHTML
                     element, or that is hidden by its style, yields u''.
        :param stylizer: Object whose ``style(elem)`` returns the style of
                         ``elem``.
        :param tag_stack: PML tags currently open in the enclosing elements
                          (plus the internal markers 'block' and 'q'). It is
                          used to avoid opening a tag twice and is restored
                          to its initial content before returning. A new
                          list is created when omitted.
        :return: The PML text for ``elem`` followed by its tail text.
        '''
        # A fresh list per top level call: a shared default list would keep
        # stale entries if an earlier conversion was aborted by an exception.
        if tag_stack is None:
            tag_stack = []

        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            return u''

        text = u''
        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            return u''

        tag = barename(elem.tag)
        # Number of entries this call pushed on tag_stack; exactly that many
        # are popped again further down.
        tag_count = 0

        # Are we in a paragraph block?
        if tag == 'p' or style['display'] == 'block':
            if 'block' not in tag_stack:
                tag_count += 1
                tag_stack.append('block')

        # Process tags that need special processing and that do not have inner
        # text. Usually these require an argument
        if tag == 'img':
            src = elem.get('src')
            # An image without a source cannot be referenced.
            if src:
                text += '\\m="%s"' % image_name(os.path.basename(src)).strip('\x00')
        if tag == 'hr':
            width = elem.get('width')
            if width:
                # The percent sign is added here, so drop one that is already
                # part of the attribute value.
                text += '\\w="%s%%"' % width.strip().rstrip('%')
            else:
                text += '\\w="50%"'

        # Process style information that holds a single tag
        # Commented out because every page in an OEB book starts with this style
        #if style['page-break-before'] == 'always':
        #    text += '\\p'

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text is not None and elem.text.strip():
            pml_tag = TAG_MAP.get(tag, None)
            if pml_tag and pml_tag not in tag_stack:
                tag_count += 1
                text += '\\%s' % pml_tag
                tag_stack.append(pml_tag)

            # Special processing of tags that require an argument.
            # Anchor links. Links containing '://' are external and skipped.
            if tag == 'a' and 'q' not in tag_stack:
                href = elem.get('href')
                if href and '://' not in href:
                    if '#' in href:
                        href = href.partition('#')[2]
                    href = os.path.splitext(os.path.basename(href))[0]
                    tag_count += 1
                    text += '\\q="#%s"' % href
                    tag_stack.append('q')
            # Anchor ids
            id_name = elem.get('id')
            if id_name:
                text += '\\Q="%s"' % os.path.splitext(id_name)[0]

            # Process style information
            for prop, pml_tags in STYLES:
                style_tag = pml_tags.get(style[prop], None)
                if style_tag and style_tag not in tag_stack:
                    tag_count += 1
                    text += '\\%s' % style_tag
                    tag_stack.append(style_tag)

            text += self.elem_text(elem, tag_stack)

        for item in elem:
            text += self.dump_text(item, stylizer, tag_stack)

        # Pop what this call opened and close it, most recently opened first.
        opened = [tag_stack.pop() for _ in range(tag_count)]
        opened.reverse()
        text += self.close_tags(opened)

        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li', 'tr'):
            text += os.linesep + os.linesep

        if 'block' not in tag_stack:
            text += os.linesep + os.linesep

        #if style['page-break-after'] == 'always':
        #    text += '\\p'

        if hasattr(elem, 'tail') and elem.tail is not None and elem.tail.strip():
            text += self.elem_tail(elem, tag_stack)

        return text

    def elem_text(self, elem, tag_stack):
        '''Return the text of ``elem``, reflowed when inside a block.'''
        return self.block_text(elem.text, 'block' in tag_stack)

    def elem_tail(self, elem, tag_stack):
        '''Return the tail of ``elem``, reflowed when inside a block.'''
        return self.block_text(elem.tail, 'block' in tag_stack)

    def block_text(self, text, in_block):
        '''
        Return ``text`` with every line break replaced by a single space when
        ``in_block`` is true, otherwise return it unchanged.
        '''
        if in_block:
            # Two character line breaks are listed first so that they become
            # one space instead of two.
            text = re.sub('\r\n|\n\r|\r|\n', ' ', text)
        return text

    def close_tags(self, tags):
        '''
        Return the PML that closes ``tags``.

        :param tags: Tags in the order they were opened. They are closed in
                     the opposite order; the internal 'block' marker
                     produces no output. The list is left untouched.
        '''
        return u''.join('\\%s' % tag for tag in reversed(tags) if tag != 'block')

View File

@ -640,12 +640,33 @@ class DeviceGUI(object):
', '.join(sent_mails), 3000)
def sync_news(self):
def sync_news(self, send_ids=None, do_auto=True):
if self.device_connected:
ids = list(dynamic.get('news_to_be_synced', set([])))
ids = list(dynamic.get('news_to_be_synced', set([]))) if send_ids is None else send_ids
ids = [id for id in ids if self.library_view.model().db.has_id(id)]
files, auto = self.library_view.model().get_preferred_formats_from_ids(
ids, self.device_manager.device_class.settings().format_map)
files, _auto_ids = self.library_view.model().get_preferred_formats_from_ids(
ids, self.device_manager.device_class.settings().format_map,
exclude_auto=do_auto)
auto = []
if _auto_ids:
for id in _auto_ids:
formats = [f.lower() for f in self.library_view.model().db.formats(id, index_is_id=True).split(',')]
formats = formats if formats != None else []
if list(set(formats).intersection(available_input_formats())) != [] and list(set(self.device_manager.device_class.settings().format_map).intersection(available_output_formats())) != []:
auto.append(id)
if auto != []:
format = None
for fmt in self.device_manager.device_class.settings().format_map:
if fmt in list(set(self.device_manager.device_class.settings().format_map).intersection(set(available_output_formats()))):
format = fmt
break
if format is not None:
autos = [self.library_view.model().db.title(id, index_is_id=True) for id in auto]
autos = '\n'.join('%s'%i for i in autos)
info_dialog(self, _('No suitable formats'),
_('Auto converting the following books before uploading to '
'the device:'), det_msg=autos, show=True)
self.auto_convert_news(auto, format)
files = [f for f in files if f is not None]
if not files:
dynamic.set('news_to_be_synced', set([]))
@ -667,8 +688,10 @@ class DeviceGUI(object):
if config['upload_news_to_device'] and files:
remove = ids if \
config['delete_news_from_library_on_upload'] else []
on_card = self.location_view.model().free[0] < \
self.location_view.model().free[1]
space = { self.location_view.model().free[0] : 'main',
self.location_view.model().free[1] : 'carda',
self.location_view.model().free[2] : 'cardb' }
on_card = space.get(sorted(space.keys(), reverse=True)[0], 'main')
self.upload_books(files, names, metadata,
on_card=on_card,
memory=[[f.name for f in files], remove])

View File

@ -1069,6 +1069,24 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
current = self.library_view.currentIndex()
self.library_view.model().current_changed(current, previous)
def auto_convert_news(self, book_ids, format):
    '''
    Queue conversion jobs that convert the books in ``book_ids`` to
    ``format``.

    Every job that was not reported as bad is run through the job manager
    with ``book_auto_converted_news`` as its completion callback and is
    recorded in ``self.conversion_jobs``. If the conversion setup reported
    changes, the rows that were selected on entry are refreshed.
    '''
    view = self.library_view
    previous = view.currentIndex()
    rows = []
    for index in view.selectionModel().selectedRows():
        rows.append(index.row())

    jobs, changed, bad = convert_single_ebook(self, view.model().db, book_ids, True, format)
    if jobs == []:
        return

    for func, args, desc, fmt, book_id, temp_files in jobs:
        if book_id in bad:
            continue
        callback = Dispatcher(self.book_auto_converted_news)
        job = self.job_manager.run_job(callback, func, args=args,
                description=desc)
        # Consumed again by the completion callback.
        self.conversion_jobs[job] = (temp_files, fmt, book_id)

    if not changed:
        return
    view.model().refresh_rows(rows)
    current = view.currentIndex()
    view.model().current_changed(current, previous)
def get_books_for_conversion(self):
rows = [r.row() for r in \
self.library_view.selectionModel().selectedRows()]
@ -1164,6 +1182,29 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
self.send_by_mail(to, fmts, delete_from_library, specific_format=fmt, send_ids=[book_id], do_auto_convert=False)
def book_auto_converted_news(self, job):
    '''
    Completion callback for a job started by ``auto_convert_news``.

    On success the converted file is added to the library as a new format
    of the book and the book is handed to ``sync_news``. On failure the job
    is passed to ``self.job_exception`` and its result is returned. The
    temporary files of the job are removed in both cases.

    :param job: The finished job; it must be a key of
                ``self.conversion_jobs``.
    '''
    temp_files, fmt, book_id = self.conversion_jobs.pop(job)
    try:
        if job.failed:
            return self.job_exception(job)
        # The context manager closes the file even if add_format raises.
        with open(temp_files[0].name, 'rb') as data:
            self.library_view.model().db.add_format(book_id, fmt, data, index_is_id=True)
        self.status_bar.showMessage(job.description + (' completed'), 2000)
    finally:
        # Best effort clean up: a temporary file that cannot be removed
        # must not mask the outcome of the conversion.
        for f in temp_files:
            try:
                if os.path.exists(f.name):
                    os.remove(f.name)
            except EnvironmentError:
                pass
    self.tags_view.recount()
    if self.current_view() is self.library_view:
        current = self.library_view.currentIndex()
        self.library_view.model().current_changed(current, QModelIndex())

    # The keyword of sync_news is ``do_auto``. False here so that the book
    # which was just converted is not sent to conversion again.
    self.sync_news(send_ids=[book_id], do_auto=False)
def book_converted(self, job):
temp_files, fmt, book_id = self.conversion_jobs.pop(job)
try: