From 19ba43153b6bec69f0df754a064e565399cea62a Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 24 Apr 2009 07:27:54 -0400 Subject: [PATCH] Line length for pdf processing --- src/calibre/ebooks/conversion/preprocess.py | 40 ++++++++++++++++++++- src/calibre/ebooks/pdb/header.py | 4 +-- src/calibre/ebooks/pdb/input.py | 2 +- 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index b105a6c042..fb55ee74fb 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -32,6 +32,39 @@ def chap_head(match): return '

'+chap+'
'+title+'


' +def line_length(raw, percent): + ''' + raw is the raw text to find the line length to use for wrapping. + percentage is a decimal number, 0 - 1 which is used to determine + how far in the list of line lengths to use. + ''' + raw = raw.replace(' ', ' ') + linere = re.compile('(?<=
).*?(?=
)', re.DOTALL) + lines = linere.findall(raw) + + lengths = [] + for line in lines: + if len(line) > 0: + lengths.append(len(line)) + total = sum(lengths) + avg = total / len(lengths) + max_line = avg * 2 + + lengths = sorted(lengths) + for i in range(len(lengths) - 1, -1, -1): + if lengths[i] > max_line: + del lengths[i] + + if percent > 1: + percent = 1 + if percent < 0: + percent = 0 + + index = int(len(lengths) * percent) - 1 + + return lengths[index] + + class CSSPreProcessor(object): PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}') @@ -129,7 +162,12 @@ class HTMLPreProcessor(object): elif self.is_book_designer(html): rules = self.BOOK_DESIGNER elif self.is_pdftohtml(html): - rules = self.PDFTOHTML + # Add rules that require matching line length here + #line_length_rules = [ + # (re.compile('%i' % line_length(html, .85)), lambda match:) + #] + + rules = self.PDFTOHTML # + line_length_rules else: rules = [] for rule in self.PREPROCESS + rules: diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py index 5b47e48a16..60ce9f15b9 100644 --- a/src/calibre/ebooks/pdb/header.py +++ b/src/calibre/ebooks/pdb/header.py @@ -63,10 +63,10 @@ class PdbHeaderReader(object): class PdbHeaderWriter(object): def __init__(self, identity, title): - self.identity = identity[:8] + self.identity = identity.ljust(3, '\x00')[:8] self.title = title.ljust(32, '\x00')[:32] - def build_header(self, sections) + def build_header(self, offsets): ''' Sections is a list of section offsets ''' diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 180e0814a6..31808d27d5 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en' import os from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.pdb.header import PdbHeader +from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader class PDBInput(InputFormatPlugin):