Line length for pdf processing

This commit is contained in:
John Schember 2009-04-24 07:27:54 -04:00
parent d871313ff0
commit 19ba43153b
3 changed files with 42 additions and 4 deletions

View File

@ -32,6 +32,39 @@ def chap_head(match):
return '<h1>'+chap+'<br/>'+title+'</h1><br/>' return '<h1>'+chap+'<br/>'+title+'</h1><br/>'
def line_length(raw, percent):
'''
raw is the raw text to find the line length to use for wrapping.
percentage is a decimal number, 0 - 1 which is used to determine
how far in the list of line lengths to use.
'''
raw = raw.replace('&nbsp;', ' ')
linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
lines = linere.findall(raw)
lengths = []
for line in lines:
if len(line) > 0:
lengths.append(len(line))
total = sum(lengths)
avg = total / len(lengths)
max_line = avg * 2
lengths = sorted(lengths)
for i in range(len(lengths) - 1, -1, -1):
if lengths[i] > max_line:
del lengths[i]
if percent > 1:
percent = 1
if percent < 0:
percent = 0
index = int(len(lengths) * percent) - 1
return lengths[index]
class CSSPreProcessor(object): class CSSPreProcessor(object):
PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}') PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
@ -129,7 +162,12 @@ class HTMLPreProcessor(object):
elif self.is_book_designer(html): elif self.is_book_designer(html):
rules = self.BOOK_DESIGNER rules = self.BOOK_DESIGNER
elif self.is_pdftohtml(html): elif self.is_pdftohtml(html):
rules = self.PDFTOHTML # Add rules that require matching line length here
#line_length_rules = [
# (re.compile('%i' % line_length(html, .85)), lambda match:)
#]
rules = self.PDFTOHTML # + line_length_rules
else: else:
rules = [] rules = []
for rule in self.PREPROCESS + rules: for rule in self.PREPROCESS + rules:

View File

@ -63,10 +63,10 @@ class PdbHeaderReader(object):
class PdbHeaderWriter(object): class PdbHeaderWriter(object):
def __init__(self, identity, title): def __init__(self, identity, title):
self.identity = identity[:8] self.identity = identity.ljust(3, '\x00')[:8]
self.title = title.ljust(32, '\x00')[:32] self.title = title.ljust(32, '\x00')[:32]
def build_header(self, sections) def build_header(self, offsets):
''' '''
Sections is a list of section offsets Sections is a list of section offsets
''' '''

View File

@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
import os import os
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.pdb.header import PdbHeader from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
class PDBInput(InputFormatPlugin): class PDBInput(InputFormatPlugin):