diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index b105a6c042..fb55ee74fb 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -32,6 +32,39 @@ def chap_head(match): return '

'+chap+'
'+title+'


' +def line_length(raw, percent): + ''' + raw is the raw text to find the line length to use for wrapping. + percentage is a decimal number, 0 - 1 which is used to determine + how far in the list of line lengths to use. + ''' + raw = raw.replace(' ', ' ') + linere = re.compile('(?<=
).*?(?=
)', re.DOTALL) + lines = linere.findall(raw) + + lengths = [] + for line in lines: + if len(line) > 0: + lengths.append(len(line)) + total = sum(lengths) + avg = total / len(lengths) + max_line = avg * 2 + + lengths = sorted(lengths) + for i in range(len(lengths) - 1, -1, -1): + if lengths[i] > max_line: + del lengths[i] + + if percent > 1: + percent = 1 + if percent < 0: + percent = 0 + + index = int(len(lengths) * percent) - 1 + + return lengths[index] + + class CSSPreProcessor(object): PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}') @@ -129,7 +162,12 @@ class HTMLPreProcessor(object): elif self.is_book_designer(html): rules = self.BOOK_DESIGNER elif self.is_pdftohtml(html): - rules = self.PDFTOHTML + # Add rules that require matching line length here + #line_length_rules = [ + # (re.compile('%i' % line_length(html, .85)), lambda match:) + #] + + rules = self.PDFTOHTML # + line_length_rules else: rules = [] for rule in self.PREPROCESS + rules: diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py index 8ff30e9349..347bde951c 100644 --- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py +++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py @@ -12,10 +12,12 @@ import re from calibre.ebooks.htmlsymbols import HTML_SYMBOLS +from BeautifulSoup import BeautifulSoup + PML_HTML_RULES = [ (re.compile(r'\\p'), lambda match: '

'), (re.compile(r'\\x(?P.+?)\\x', re.DOTALL), lambda match: '

%s

' % match.group('text')), - (re.compile(r'\\X(?P[0-4])(?P.+?)\\X[0-4]', re.DOTALL), lambda match: '%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), + (re.compile(r'\\X(?P[0-4])(?P.+?)\\X[0-4]', re.DOTALL), lambda match: '%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), (re.compile(r'\\C\d=".+"'), lambda match: ''), # This should be made to create a TOC entry (re.compile(r'\\c(?P.+?)\\c', re.DOTALL), lambda match: '
%s
' % match.group('text')), (re.compile(r'\\r(?P.+?)\\r', re.DOTALL), lambda match: '
%s
' % match.group('text')), @@ -24,8 +26,8 @@ PML_HTML_RULES = [ (re.compile(r'\\o(?P.+?)\\o', re.DOTALL), lambda match: '%s' % match.group('text')), (re.compile(r'\\v(?P.+?)\\v', re.DOTALL), lambda match: '' % match.group('text')), (re.compile(r'\\t(?P.+?)\\t', re.DOTALL), lambda match: '
%s
' % match.group('text')), - (re.compile(r'\\T="(?P\d+)%%*"(?P.+?)$', re.MULTILINE), lambda match: '
%s
' % (match.group('val'), match.group('text'))), - (re.compile(r'\\w="(?P\d+)%%"'), lambda match: '
' % match.group('val')), + (re.compile(r'\\T="(?P\d+)%*"(?P.+?)$', re.MULTILINE), lambda match: r'
%s
' % (match.group('val'), match.group('text'))), + (re.compile(r'\\w="(?P\d+)%"'), lambda match: '
' % match.group('val')), (re.compile(r'\\n'), lambda match: ''), (re.compile(r'\\s'), lambda match: ''), (re.compile(r'\\b(?P.+?)\\b', re.DOTALL), lambda match: '%s' % match.group('text')), # \b is deprecated; \B should be used instead. @@ -34,10 +36,10 @@ PML_HTML_RULES = [ (re.compile(r'\\Sp(?P.+?)\\Sp', re.DOTALL), lambda match: '%s' % match.group('text')), (re.compile(r'\\Sb(?P.+?)\\Sb', re.DOTALL), lambda match: '%s' % match.group('text')), (re.compile(r'\\k(?P.+?)\\k', re.DOTALL), lambda match: '%s' % match.group('text')), - (re.compile(r'\\a(?P\d\d\d)'), lambda match: '&#%i;' % match.group('num')), - (re.compile(r'\\U(?P\d\d\d\d)'), lambda match: '&#%i;' % int(match.group('num'))), + (re.compile(r'\\a(?P\d\d\d)'), lambda match: '&#%s;' % match.group('num')), + (re.compile(r'\\U(?P\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))), (re.compile(r'\\m="(?P.+?)"'), lambda match: '' % match.group('name')), - (re.compile(r'\\q="(?P#.+?)"(?P)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))), + (re.compile(r'\\q="(?P#.+?)"(?P.+?)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))), (re.compile(r'\\Q="(?P.+?)"'), lambda match: '
' % match.group('target')), (re.compile(r'\\-'), lambda match: ''), (re.compile(r'\\Fn="(?P.+?)"(?P.+?)\\Fn'), lambda match: '%s' % (match.group('target'), match.group('text'))), @@ -58,14 +60,6 @@ PML_HTML_RULES = [ (re.compile(r'\\\\'), lambda match: '\\'), ] -FOOTNOTE_HTML_RULES = [ - (re.compile('(?P.+?)', re.DOTALL), lambda match: '
%s
') -] - -SIDEBAR_HTML_RULES = [ - (re.compile('(?P.+?)', re.DOTALL), lambda match: '') -] - HTML_PML_RULES = [ (re.compile(r'\\'), lambda match: '\\\\'), (re.compile('(?<=[^\n])[ ]*'), lambda match: '\n

'), @@ -75,23 +69,23 @@ HTML_PML_RULES = [ (re.compile('.+?).*?">'), lambda match: '\\\\Q="%s"' % match.group('target')), (re.compile('#.+?).*?">(?P)', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))), (re.compile('.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')), - (re.compile('&#(?P\d\d\d\d);'), lambda match: '\\U%i' % int(match.group('num'))), - (re.compile('&#(?P\d\d\d);'), lambda match: '\\a%i' % match.group('num')), + #(re.compile('&#(?P\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))), + (re.compile('&#(?P\d\d\d);'), lambda match: '\\a%s' % match.group('num')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), - (re.compile('\d+)%%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')), - (re.compile('\d+)%%*;.*?>(?P.+?)', re.MULTILINE), lambda match: '\\T="%i%%"%s$' % (match.group('val'), match.group('text'))), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')), + (re.compile('\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')), + (re.compile('\d+)%*;.*?>(?P.+?)', re.MULTILINE), lambda match: '\\T="%s%%"%s$' % (match.group('val'), match.group('text'))), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')), (re.compile('', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\u%s\\u' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\r%s\\r' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')), - (re.compile('[0-4]).*?>(?P.+?)', re.DOTALL), lambda match: '\\X%i%s\\X%i' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), + (re.compile('[0-4]).*?>(?P.+?)', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')), (re.compile(''), lambda match: '\\p'), (re.compile('<.*?>'), lambda match: ''), @@ -109,28 +103,20 @@ def pml_to_html(pml): return html -def footnote_to_html(footnotes): - html = footnotes - for rule in FOOTNOTE_HTML_RULES: - html = rule[0].sub(rule[1], html) - - html = pml_to_html(html) - - return html - -def sidebar_to_html(sidebars): - html = sidebars - for rule in FOOTNOTE_HTML_RULES: - html = rule[0].sub(rule[1], html) - - html = pml_to_html(html) - - return html +def footnote_sidebar_to_html(id, pml): + html = '

%s
' % (id, id, pml_to_html(pml)) + return html def html_to_pml(html): - pml = html - for rule in HTML_PML_RULES: - pml = rule[0].sub(rule[1], pml) + pml = '' + + for dom_tree in BeautifulSoup(html).findAll('body'): + body = unicode(dom_tree.pretty_print()) + + for rule in HTML_PML_RULES: + body = rule[0].sub(rule[1], pml) + + pml += body # Replace symbols outside of cp1512 wtih \Uxxxx diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index 8a0abb970e..e0e42e40fd 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -8,7 +8,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os, sys, struct, zlib +import os, re, sys, struct, zlib from calibre import CurrentDir from calibre.ebooks import DRMError @@ -16,7 +16,7 @@ from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ereader import EreaderError from calibre.ebooks.pdb.ereader.pmlconverter import pml_to_html, \ - footnote_to_html, sidebar_to_html + footnote_sidebar_to_html from calibre.ebooks.mobi.palmdoc import decompress_doc from calibre.ebooks.metadata.opf2 import OPFCreator @@ -42,14 +42,6 @@ class HeaderRecord(object): self.num_text_pages = self.non_text_offset -1 self.num_image_pages = self.metadata_offset - self.image_data_offset - - # Can't tell which is sidebar and footnote if they have same offset. - # They don't exist if offset is larget than last_record. - # Todo: Determine if the subtraction is necessary and find out - # what _rec means. - end_footnote_offset = self.sidebar_offset if self.sidebar_offset != self.footnote_offset else self.last_data_offset - self.num_footnote_pages = end_footnote_offset - self.footnote_offset if self.footnote_offset < self.last_data_offset else 0 - self.num_sidebar_pages = self.sidebar_offset - self.last_data_offset if self.footnote_offset < self.last_data_offset else 0 class Reader(FormatReader): @@ -94,44 +86,10 @@ class Reader(FormatReader): assumed to be encoded as Windows-1252. The encoding is part of the eReader file spec and should always be this encoding. ''' - if number not in range(1, self.header_record.num_text_pages): + if number not in range(1, self.header_record.num_text_pages + 1): return '' return self.decompress_text(number) - - def get_footnote_page(self, number): - if number not in range(self.header_record.footnote_offset, self.header_record.footnote_offset + self.header_record.num_footnote_pages): - return '' - - return self.decompress_text(number) - - def get_sidebar_page(self, number): - if number not in range(self.header_record.sidebar_offset, self.header_record.sidebar_offset + self.header_record.num_sidebar_pages - 1): - return '' - - return self.decompress_text(number) - - def has_footnotes(self): - if self.header_record.num_footnote_pages > 1: - try: - content = self.decompress_text(self.header_record.footnote_offset) - - if content.contains(''): - return True - except: - pass - return False - - def has_sidebar(self): - if self.header_record.num_sidebar_pages > 1: - try: - content = self.decompress_text(self.header_record.sidebar_offset) - - if content.contains(''): - return True - except: - pass - return False def extract_content(self, output_dir): output_dir = os.path.abspath(output_dir) @@ -144,22 +102,25 @@ class Reader(FormatReader): for i in range(1, self.header_record.num_text_pages + 1): self.log.debug('Extracting text page %i' % i) html += pml_to_html(self.get_text_page(i)) - - # Untested: The num_.._pages variable may not be correct! - # Possibly use .._rec instead? - ''' - if has_footnotes(): + + if self.header_record.footnote_rec > 0: html += '

%s

' % _('Footnotes') - for i in range(self.header_record.footnote_offset, self.header_record.num_footnote_pages): + footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding)) + for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_rec)): self.log.debug('Extracting footnote page %i' % i) - html += footnote_to_html(self.get_footnote_page(i)) + html += '
' + html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i)) + html += '
' - if has_sidebar(): + + if self.header_record.sidebar_rec > 0: html += '

%s

' % _('Sidebar') - for i in range(self.header_record.sidebar_offset, self.header_record.num_sidebar_pages): + sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding)) + for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_rec)): self.log.debug('Extracting sidebar page %i' % i) - html += sidebar_to_html(self.get_sidebar_page(i)) - ''' + html += '
' + html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i)) + html += '
' html += '' diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py index d098a64f2b..d270c0ef71 100644 --- a/src/calibre/ebooks/pdb/header.py +++ b/src/calibre/ebooks/pdb/header.py @@ -63,12 +63,12 @@ class PdbHeaderReader(object): class PdbHeaderWriter(object): def __init__(self, identity, title): - self.identity = identity[:8] + self.identity = identity.ljust(3, '\x00')[:8] self.title = title.ljust(32, '\x00')[:32] - - def build_header(self, sections): + + def build_header(self, offsets): ''' - Sections is a list of section offsets + Offsets is a list of section offsets ''' diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 31dd216ee1..24bc8a1025 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en' import os from calibre.customize.conversion import InputFormatPlugin +from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader class PDBInput(InputFormatPlugin):