mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Pull from driver-dev
This commit is contained in:
commit
316e55244a
@ -32,6 +32,39 @@ def chap_head(match):
|
||||
return '<h1>'+chap+'<br/>'+title+'</h1><br/>'
|
||||
|
||||
|
||||
def line_length(raw, percent):
    '''
    Return a representative line length to use for wrapping.

    raw is the raw text to find the line length to use for wrapping. A
    "line" is any run of text found between two <br> tags; empty lines are
    ignored, as are lines longer than twice the (integer) average length.
    percent is a decimal number, 0 - 1, which is used to determine how far
    into the sorted list of remaining line lengths to look. Values outside
    that range are clamped to it.

    Returns the selected length, or 0 when raw contains no non-empty
    <br>-delimited lines.
    '''
    # NOTE(review): this normalises the '&nbsp;' entity. The previous
    # literal was a single (possibly non-breaking) space, which cannot
    # change any line length -- confirm '&nbsp;' was the intent.
    raw = raw.replace('&nbsp;', ' ')
    linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)

    lengths = [len(line) for line in linere.findall(raw) if line]
    if not lengths:
        # Nothing to measure; avoid dividing by zero below.
        return 0

    # Floor division keeps the average an integer regardless of interpreter.
    avg = sum(lengths) // len(lengths)
    max_line = avg * 2

    # Drop outliers. The shortest line is never above max_line, so the
    # filtered list cannot become empty.
    lengths = sorted(length for length in lengths if length <= max_line)

    percent = min(1, max(0, percent))

    # Clamp at 0: a small percent would otherwise yield index -1, which
    # selects the *longest* line rather than the shortest.
    index = max(int(len(lengths) * percent) - 1, 0)

    return lengths[index]
|
||||
|
||||
|
||||
class CSSPreProcessor(object):
|
||||
|
||||
PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
|
||||
@ -129,7 +162,12 @@ class HTMLPreProcessor(object):
|
||||
elif self.is_book_designer(html):
|
||||
rules = self.BOOK_DESIGNER
|
||||
elif self.is_pdftohtml(html):
|
||||
rules = self.PDFTOHTML
|
||||
# Add rules that require matching line length here
|
||||
#line_length_rules = [
|
||||
# (re.compile('%i' % line_length(html, .85)), lambda match:)
|
||||
#]
|
||||
|
||||
rules = self.PDFTOHTML # + line_length_rules
|
||||
else:
|
||||
rules = []
|
||||
for rule in self.PREPROCESS + rules:
|
||||
|
@ -12,10 +12,12 @@ import re
|
||||
|
||||
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
PML_HTML_RULES = [
|
||||
(re.compile(r'\\p'), lambda match: '<br /><br style="page-break-after: always;" />'),
|
||||
(re.compile(r'\\x(?P<text>.+?)\\x', re.DOTALL), lambda match: '<h1 style="page-break-before: always;">%s</h1>' % match.group('text')),
|
||||
(re.compile(r'\\X(?P<val>[0-4])(?P<text>.+?)\\X[0-4]', re.DOTALL), lambda match: '<h%i style="page-break-before: always;">%s</h%i>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
|
||||
(re.compile(r'\\X(?P<val>[0-4])(?P<text>.+?)\\X[0-4]', re.DOTALL), lambda match: '<h%s style="page-break-before: always;">%s</h%s>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
|
||||
(re.compile(r'\\C\d=".+"'), lambda match: ''), # This should be made to create a TOC entry
|
||||
(re.compile(r'\\c(?P<text>.+?)\\c', re.DOTALL), lambda match: '<div style="text-align: center; display: block; margin: auto;">%s</div>' % match.group('text')),
|
||||
(re.compile(r'\\r(?P<text>.+?)\\r', re.DOTALL), lambda match: '<div style="text-align: right; display: block;">%s</div>' % match.group('text')),
|
||||
@ -24,8 +26,8 @@ PML_HTML_RULES = [
|
||||
(re.compile(r'\\o(?P<text>.+?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text')),
|
||||
(re.compile(r'\\v(?P<text>.+?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text')),
|
||||
(re.compile(r'\\t(?P<text>.+?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%;">%s</div>' % match.group('text')),
|
||||
(re.compile(r'\\T="(?P<val>\d+)%%*"(?P<text>.+?)$', re.MULTILINE), lambda match: '<div style="margin-left: %i%%;">%s</div>' % (match.group('val'), match.group('text'))),
|
||||
(re.compile(r'\\w="(?P<val>\d+)%%"'), lambda match: '<hr width="%s%%" />' % match.group('val')),
|
||||
(re.compile(r'\\T="(?P<val>\d+)%*"(?P<text>.+?)$', re.MULTILINE), lambda match: r'<div style="margin-left: %s%%;">%s</div>' % (match.group('val'), match.group('text'))),
|
||||
(re.compile(r'\\w="(?P<val>\d+)%"'), lambda match: '<hr width="%s%%" />' % match.group('val')),
|
||||
(re.compile(r'\\n'), lambda match: ''),
|
||||
(re.compile(r'\\s'), lambda match: ''),
|
||||
(re.compile(r'\\b(?P<text>.+?)\\b', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text')), # \b is deprecated; \B should be used instead.
|
||||
@ -34,10 +36,10 @@ PML_HTML_RULES = [
|
||||
(re.compile(r'\\Sp(?P<text>.+?)\\Sp', re.DOTALL), lambda match: '<sup>%s</sup>' % match.group('text')),
|
||||
(re.compile(r'\\Sb(?P<text>.+?)\\Sb', re.DOTALL), lambda match: '<sub>%s</sub>' % match.group('text')),
|
||||
(re.compile(r'\\k(?P<text>.+?)\\k', re.DOTALL), lambda match: '<small>%s</small>' % match.group('text')),
|
||||
(re.compile(r'\\a(?P<num>\d\d\d)'), lambda match: '&#%i;' % match.group('num')),
|
||||
(re.compile(r'\\U(?P<num>\d\d\d\d)'), lambda match: '&#%i;' % int(match.group('num'))),
|
||||
(re.compile(r'\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')),
|
||||
(re.compile(r'\\U(?P<num>\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))),
|
||||
(re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % match.group('name')),
|
||||
(re.compile(r'\\q="(?P<target>#.+?)"(?P<text>)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text'))),
|
||||
(re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.+?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text'))),
|
||||
(re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<div id="%s"></div>' % match.group('target')),
|
||||
(re.compile(r'\\-'), lambda match: ''),
|
||||
(re.compile(r'\\Fn="(?P<target>.+?)"(?P<text>.+?)\\Fn'), lambda match: '<a href="#footnote-%s">%s</a>' % (match.group('target'), match.group('text'))),
|
||||
@ -58,14 +60,6 @@ PML_HTML_RULES = [
|
||||
(re.compile(r'\\\\'), lambda match: '\\'),
|
||||
]
|
||||
|
||||
# (pattern, replacement) pairs that rewrite <footnote id="...">...</footnote>
# markup into a <div> whose id is "footnote-" followed by the footnote id.
FOOTNOTE_HTML_RULES = [
    # The replacement must interpolate the captured groups; returning the
    # bare format string would emit a literal "%s" into the output.
    (re.compile('<footnote id="(?P<id>.+?)">(?P<text>.+?)</footnote>', re.DOTALL), lambda match: '<div id="footnote-%s">%s</div>' % (match.group('id'), match.group('text'))),
]
|
||||
|
||||
# (pattern, replacement) pairs that rewrite <sidebar id="...">...</sidebar>
# markup into a <div> whose id is "sidebar-" followed by the sidebar id.
SIDEBAR_HTML_RULES = [
    # The replacement must interpolate the captured groups; returning the
    # bare format string would emit a literal "%s" into the output.
    (re.compile('<sidebar id="(?P<id>.+?)">(?P<text>.+?)</sidebar>', re.DOTALL), lambda match: '<div id="sidebar-%s">%s</div>' % (match.group('id'), match.group('text'))),
]
|
||||
|
||||
HTML_PML_RULES = [
|
||||
(re.compile(r'\\'), lambda match: '\\\\'),
|
||||
(re.compile('(?<=[^\n])[ ]*<p.*?>'), lambda match: '\n<p>'),
|
||||
@ -75,23 +69,23 @@ HTML_PML_RULES = [
|
||||
(re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: '\\\\Q="%s"' % match.group('target')),
|
||||
(re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))),
|
||||
(re.compile('<img.*?src="images/(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')),
|
||||
(re.compile('&#(?P<num>\d\d\d\d);'), lambda match: '\\U%i' % int(match.group('num'))),
|
||||
(re.compile('&#(?P<num>\d\d\d);'), lambda match: '\\a%i' % match.group('num')),
|
||||
#(re.compile('&#(?P<num>\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))),
|
||||
(re.compile('&#(?P<num>\d\d\d);'), lambda match: '\\a%s' % match.group('num')),
|
||||
(re.compile('<small.*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
|
||||
(re.compile('<sub.*?>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
|
||||
(re.compile('<sup.*?>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
|
||||
(re.compile('<b.*?>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
|
||||
(re.compile('<big.*?>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
|
||||
(re.compile('<hr.*?width="(?P<val>\d+)%%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')),
|
||||
(re.compile('<div.*?style.*?margin-left: (?P<val>\d+)%%*;.*?>(?P<text>.+?)</div>', re.MULTILINE), lambda match: '\\T="%i%%"%s$' % (match.group('val'), match.group('text'))),
|
||||
(re.compile('<div.*?style.*?margin-left: \d{1,3}%%;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')),
|
||||
(re.compile('<hr.*?width="(?P<val>\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')),
|
||||
(re.compile('<div.*?style.*?margin-left: (?P<val>\d+)%*;.*?>(?P<text>.+?)</div>', re.MULTILINE), lambda match: '\\T="%s%%"%s$' % (match.group('val'), match.group('text'))),
|
||||
(re.compile('<div.*?style.*?margin-left: \d{1,3}%;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')),
|
||||
(re.compile('<!-- (?P<text>.+?) -->', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')),
|
||||
(re.compile('<del.*?>(?P<text>.+?)</del>', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')),
|
||||
(re.compile('<div.*?style.*?text-decoration: underline;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\u%s\\u' % match.group('text')),
|
||||
(re.compile('<i.*?>(?P<text>.+?)</i>', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')),
|
||||
(re.compile('<div.*?style.*?text-align: right;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\r%s\\r' % match.group('text')),
|
||||
(re.compile('<div.*?style.*?text-align: center;.*?".*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')),
|
||||
(re.compile('<h(?P<val>[0-4]).*?>(?P<text>.+?)</h[0-4]>', re.DOTALL), lambda match: '\\X%i%s\\X%i' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
|
||||
(re.compile('<h(?P<val>[0-4]).*?>(?P<text>.+?)</h[0-4]>', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
|
||||
(re.compile('<h1.*?>(?P<text>.+?)</h1>', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')),
|
||||
(re.compile('<br.*?>'), lambda match: '\\p'),
|
||||
(re.compile('<.*?>'), lambda match: ''),
|
||||
@ -109,28 +103,20 @@ def pml_to_html(pml):
|
||||
|
||||
return html
|
||||
|
||||
def footnote_to_html(footnotes):
    '''
    Convert footnote markup to HTML.

    Each (pattern, replacement) pair in FOOTNOTE_HTML_RULES is applied to
    footnotes in order; the result is then run through pml_to_html and
    returned.
    '''
    converted = footnotes
    for pattern, replacement in FOOTNOTE_HTML_RULES:
        converted = pattern.sub(replacement, converted)

    return pml_to_html(converted)
|
||||
|
||||
def sidebar_to_html(sidebars):
    '''
    Convert sidebar markup to HTML.

    Each (pattern, replacement) pair in SIDEBAR_HTML_RULES is applied to
    sidebars in order; the result is then run through pml_to_html and
    returned.
    '''
    html = sidebars
    # Use the sidebar rule table: the footnote rules only match <footnote>
    # tags and would leave <sidebar> markup untouched.
    for pattern, replacement in SIDEBAR_HTML_RULES:
        html = pattern.sub(replacement, html)

    return pml_to_html(html)
|
||||
def footnote_sidebar_to_html(id, pml):
    '''
    Build the HTML definition-list entry for one footnote or sidebar.

    id is used both as the suffix of the wrapping div's id attribute and as
    the text of the <dt>; pml is converted with pml_to_html and placed in
    the <dd>.
    '''
    # NOTE(review): the div id always carries the 'sidebar-' prefix, even
    # when this is called for footnotes, while the \Fn rule in
    # PML_HTML_RULES links to '#footnote-...' -- confirm this is intended.
    body = pml_to_html(pml)
    return '<div id="sidebar-%s"><dt>%s</dt></div><dd>%s</dd>' % (id, id, body)
|
||||
|
||||
def html_to_pml(html):
|
||||
pml = html
|
||||
for rule in HTML_PML_RULES:
|
||||
pml = rule[0].sub(rule[1], pml)
|
||||
pml = ''
|
||||
|
||||
for dom_tree in BeautifulSoup(html).findAll('body'):
|
||||
body = unicode(dom_tree.pretty_print())
|
||||
|
||||
for rule in HTML_PML_RULES:
|
||||
body = rule[0].sub(rule[1], pml)
|
||||
|
||||
pml += body
|
||||
|
||||
# Replace symbols outside of cp1252 with \Uxxxx
|
||||
|
||||
|
@ -8,7 +8,7 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, sys, struct, zlib
|
||||
import os, re, sys, struct, zlib
|
||||
|
||||
from calibre import CurrentDir
|
||||
from calibre.ebooks import DRMError
|
||||
@ -16,7 +16,7 @@ from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||
from calibre.ebooks.pdb.ereader import EreaderError
|
||||
from calibre.ebooks.pdb.ereader.pmlconverter import pml_to_html, \
|
||||
footnote_to_html, sidebar_to_html
|
||||
footnote_sidebar_to_html
|
||||
from calibre.ebooks.mobi.palmdoc import decompress_doc
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
|
||||
@ -42,14 +42,6 @@ class HeaderRecord(object):
|
||||
|
||||
self.num_text_pages = self.non_text_offset -1
|
||||
self.num_image_pages = self.metadata_offset - self.image_data_offset
|
||||
|
||||
# Can't tell which is sidebar and footnote if they have same offset.
|
||||
# They don't exist if offset is larger than last_record.
|
||||
# Todo: Determine if the subtraction is necessary and find out
|
||||
# what _rec means.
|
||||
end_footnote_offset = self.sidebar_offset if self.sidebar_offset != self.footnote_offset else self.last_data_offset
|
||||
self.num_footnote_pages = end_footnote_offset - self.footnote_offset if self.footnote_offset < self.last_data_offset else 0
|
||||
self.num_sidebar_pages = self.sidebar_offset - self.last_data_offset if self.footnote_offset < self.last_data_offset else 0
|
||||
|
||||
|
||||
class Reader(FormatReader):
|
||||
@ -94,44 +86,10 @@ class Reader(FormatReader):
|
||||
assumed to be encoded as Windows-1252. The encoding is part of
|
||||
the eReader file spec and should always be this encoding.
|
||||
'''
|
||||
if number not in range(1, self.header_record.num_text_pages):
|
||||
if number not in range(1, self.header_record.num_text_pages + 1):
|
||||
return ''
|
||||
|
||||
return self.decompress_text(number)
|
||||
|
||||
def get_footnote_page(self, number):
    '''
    Return the decompressed text of footnote record number.

    An empty string is returned when number lies outside the range that
    starts at header_record.footnote_offset and spans
    header_record.num_footnote_pages records.
    '''
    header = self.header_record
    valid_pages = range(header.footnote_offset, header.footnote_offset + header.num_footnote_pages)
    if number in valid_pages:
        return self.decompress_text(number)

    return ''
|
||||
|
||||
def get_sidebar_page(self, number):
    '''
    Return the decompressed text of sidebar record number.

    An empty string is returned when number lies outside the range that
    starts at header_record.sidebar_offset and ends before
    sidebar_offset + num_sidebar_pages - 1.
    '''
    # NOTE(review): unlike get_footnote_page, the upper bound here is
    # reduced by one, so the last counted page is rejected -- confirm this
    # is intended.
    header = self.header_record
    valid_pages = range(header.sidebar_offset, header.sidebar_offset + header.num_sidebar_pages - 1)
    if number in valid_pages:
        return self.decompress_text(number)

    return ''
|
||||
|
||||
def has_footnotes(self):
    '''
    Return True if the footnote record contains footnote markup.

    Only checked when header_record.num_footnote_pages is greater than 1.
    The record at header_record.footnote_offset is decompressed and searched
    for a closing </footnote> tag. Any error while reading or searching the
    record is treated as "no footnotes" and yields False.
    '''
    if self.header_record.num_footnote_pages > 1:
        try:
            content = self.decompress_text(self.header_record.footnote_offset)

            # Strings have no contains() method; use the membership
            # operator so the check can actually succeed.
            if '</footnote>' in content:
                return True
        except Exception:
            # Best effort: an unreadable record simply means no footnotes.
            pass
    return False
|
||||
|
||||
def has_sidebar(self):
    '''
    Return True if the sidebar record contains sidebar markup.

    Only checked when header_record.num_sidebar_pages is greater than 1.
    The record at header_record.sidebar_offset is decompressed and searched
    for a closing </sidebar> tag. Any error while reading or searching the
    record is treated as "no sidebar" and yields False.
    '''
    if self.header_record.num_sidebar_pages > 1:
        try:
            content = self.decompress_text(self.header_record.sidebar_offset)

            # Strings have no contains() method; use the membership
            # operator so the check can actually succeed.
            if '</sidebar>' in content:
                return True
        except Exception:
            # Best effort: an unreadable record simply means no sidebar.
            pass
    return False
|
||||
|
||||
def extract_content(self, output_dir):
|
||||
output_dir = os.path.abspath(output_dir)
|
||||
@ -144,22 +102,25 @@ class Reader(FormatReader):
|
||||
for i in range(1, self.header_record.num_text_pages + 1):
|
||||
self.log.debug('Extracting text page %i' % i)
|
||||
html += pml_to_html(self.get_text_page(i))
|
||||
|
||||
# Untested: The num_.._pages variable may not be correct!
|
||||
# Possibly use .._rec instead?
|
||||
'''
|
||||
if has_footnotes():
|
||||
|
||||
if self.header_record.footnote_rec > 0:
|
||||
html += '<br /><h1>%s</h1>' % _('Footnotes')
|
||||
for i in range(self.header_record.footnote_offset, self.header_record.num_footnote_pages):
|
||||
footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
|
||||
for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_rec)):
|
||||
self.log.debug('Extracting footnote page %i' % i)
|
||||
html += footnote_to_html(self.get_footnote_page(i))
|
||||
html += '<dl>'
|
||||
html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i))
|
||||
html += '</dl>'
|
||||
|
||||
if has_sidebar():
|
||||
|
||||
if self.header_record.sidebar_rec > 0:
|
||||
html += '<br /><h1>%s</h1>' % _('Sidebar')
|
||||
for i in range(self.header_record.sidebar_offset, self.header_record.num_sidebar_pages):
|
||||
sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
|
||||
for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_rec)):
|
||||
self.log.debug('Extracting sidebar page %i' % i)
|
||||
html += sidebar_to_html(self.get_sidebar_page(i))
|
||||
'''
|
||||
html += '<dl>'
|
||||
html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i))
|
||||
html += '</dl>'
|
||||
|
||||
html += '</body></html>'
|
||||
|
||||
|
@ -63,12 +63,12 @@ class PdbHeaderReader(object):
|
||||
class PdbHeaderWriter(object):
|
||||
|
||||
def __init__(self, identity, title):
|
||||
self.identity = identity[:8]
|
||||
self.identity = identity.ljust(3, '\x00')[:8]
|
||||
self.title = title.ljust(32, '\x00')[:32]
|
||||
|
||||
def build_header(self, sections):
|
||||
|
||||
def build_header(self, offsets):
|
||||
'''
|
||||
Sections is a list of section offsets
|
||||
Offsets is a list of section offsets
|
||||
'''
|
||||
|
||||
|
||||
|
@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en'
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.pdb.header import PdbHeaderReader
|
||||
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
|
||||
|
||||
class PDBInput(InputFormatPlugin):
|
||||
|
Loading…
x
Reference in New Issue
Block a user