Add --force-page-break-before-attr

This commit is contained in:
Kovid Goyal 2007-08-11 18:47:01 +00:00
parent ba7f7278fe
commit 1f24807b87
3 changed files with 47 additions and 28 deletions

View File

@ -13,7 +13,7 @@
## with this program; if not, write to the Free Software Foundation, Inc., ## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
''' E-book management software''' ''' E-book management software'''
__version__ = "0.3.93" __version__ = "0.3.94"
__docformat__ = "epytext" __docformat__ = "epytext"
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>" __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
__appname__ = 'libprs500' __appname__ = 'libprs500'

View File

@ -153,6 +153,8 @@ def option_parser(usage):
'''if the current page has only a few elements.''') '''if the current page has only a few elements.''')
chapter.add_option('--force-page-break-before', dest='force_page_break', chapter.add_option('--force-page-break-before', dest='force_page_break',
default='$', help='Like --page-break-before, but page breaks are forced.') default='$', help='Like --page-break-before, but page breaks are forced.')
chapter.add_option('--force-page-break-before-attr', dest='force_page_break_attr',
default='$,,$', help='Force a page break before an element having the specified attribute. The format for this option is tagname regexp,attribute name,attribute value regexp. For example to match all heading tags that have the attribute class="chapter" you would use "h\d,class,chapter". Default is %default''')
prepro = parser.add_option_group('PREPROCESSING OPTIONS') prepro = parser.add_option_group('PREPROCESSING OPTIONS')
prepro.add_option('--baen', action='store_true', default=False, dest='baen', prepro.add_option('--baen', action='store_true', default=False, dest='baen',
help='''Preprocess Baen HTML files to improve generated LRF.''') help='''Preprocess Baen HTML files to improve generated LRF.''')

View File

@ -35,11 +35,11 @@ except ImportError:
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, \ from libprs500.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, \
Comment, Tag, NavigableString, Declaration, ProcessingInstruction Comment, Tag, NavigableString, Declaration, ProcessingInstruction
from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \ from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \
TextBlock, ImageBlock, JumpButton, CharButton, Bold,\ TextBlock, ImageBlock, JumpButton, CharButton, \
Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \ Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \
LrsError LrsError
from libprs500.ebooks.lrf.pylrs.pylrs import Span as _Span from libprs500.ebooks.lrf.pylrs.pylrs import Span as _Span
from libprs500.ebooks.lrf import Book, PRS500_PROFILE from libprs500.ebooks.lrf import Book
from libprs500.ebooks.lrf import option_parser as lrf_option_parser from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks import ConversionError from libprs500.ebooks import ConversionError
from libprs500.ebooks.lrf.html.table import Table from libprs500.ebooks.lrf.html.table import Table
@ -262,7 +262,7 @@ class HTMLConverter(object):
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction) IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
# Fix <a /> elements
MARKUP_MASSAGE = [ MARKUP_MASSAGE = [
# Close <a /> tags # Close <a /> tags
(re.compile("(<a\s+.*?)/>|<a/>", re.IGNORECASE), (re.compile("(<a\s+.*?)/>|<a/>", re.IGNORECASE),
@ -926,6 +926,29 @@ class HTMLConverter(object):
blockwidth=pwidth, blockheight=pheight), blockwidth=pwidth, blockheight=pheight),
left, 0) left, 0)
def process_page_breaks(self, tag, tagname, tag_css):
    '''
    Apply every page break rule that concerns C{tag}.

    Three kinds of rule are checked, in this order:
      1. The CSS properties C{page-break-before} and C{page-break-after}
         found in C{tag_css}.
      2. The forced rules C{self.force_page_break} (matched against the tag
         name) and C{self.force_page_break_attr} (tag name pattern,
         attribute name, attribute value pattern).
      3. The soft rule C{self.page_break}, which is only consulted while
         C{self.page_break_found} is false and only ends the page when the
         current page already holds more than 3 items.

    @param tag: The element being processed. Only queried via C{has_key}
                and item access, and only when the tag name pattern of
                C{self.force_page_break_attr} matches.
    @param tagname: Name of the tag. The caller passes it lower-cased.
    @param tag_css: Mapping of CSS property name to value for this tag.
                    B{Modified in place}: handled page break properties are
                    removed from it.
    @return: C{True} if a C{page-break-after} other than C{avoid} was
             found, C{False} otherwise. Presumably the caller uses this to
             end the page once the tag has been processed.
    '''
    # NOTE(review): the nesting below was reconstructed from the statement
    # shapes (nested test for 'before', combined test for 'after');
    # confirm against the canonical source.

    # page-break-before: the property is always consumed, but the page is
    # only ended when the value is something other than 'avoid'.
    if 'page-break-before' in tag_css:
        if tag_css['page-break-before'].lower() != 'avoid':
            self.end_page()
        tag_css.pop('page-break-before')

    # page-break-after: reported to the caller instead of acted on here.
    # An 'avoid' value is left untouched in tag_css.
    end_page = False
    if ('page-break-after' in tag_css and
            tag_css['page-break-after'].lower() != 'avoid'):
        end_page = True
        tag_css.pop('page-break-after')

    # Forced breaks: by tag name + attribute, or by tag name alone.
    # tag.has_key is kept on purpose; assuming tag is a BeautifulSoup 3
    # Tag, `in` would test its children rather than its attributes.
    attr_spec = self.force_page_break_attr
    tag_pat, attr_name, value_pat = attr_spec[0], attr_spec[1], attr_spec[2]
    forced_by_attr = (tag_pat.match(tagname) and
                      tag.has_key(attr_name) and
                      value_pat.match(tag[attr_name]))
    if forced_by_attr or self.force_page_break.match(tagname):
        self.end_page()
        # Once a forced break has been seen the soft rule below is disabled.
        self.page_break_found = True

    # Soft breaks: skipped on nearly empty pages.
    if not self.page_break_found and self.page_break.match(tagname):
        if len(self.current_page.contents) > 3:
            self.end_page()
            if self.verbose:
                # Single-argument form: same output as the print statement
                # on Python 2, and also valid syntax on Python 3.
                print('Forcing page break at %s' % tagname)
    return end_page
def parse_tag(self, tag, parent_css): def parse_tag(self, tag, parent_css):
try: try:
tagname = tag.name.lower() tagname = tag.name.lower()
@ -940,23 +963,8 @@ class HTMLConverter(object):
return return
except KeyError: except KeyError:
pass pass
if 'page-break-before' in tag_css.keys(): end_page = self.process_page_breaks(tag, tagname, tag_css)
if tag_css['page-break-before'].lower() != 'avoid':
self.end_page()
tag_css.pop('page-break-before')
end_page = False
if 'page-break-after' in tag_css.keys() and \
tag_css['page-break-after'].lower() != 'avoid':
end_page = True
tag_css.pop('page-break-after')
if self.force_page_break.match(tagname):
self.end_page()
self.page_break_found = True
if not self.page_break_found and self.page_break.match(tagname):
if len(self.current_page.contents) > 3:
self.end_page()
if self.verbose:
print 'Forcing page break at', tagname
if tagname in ["title", "script", "meta", 'del', 'frameset']: if tagname in ["title", "script", "meta", 'del', 'frameset']:
pass pass
elif tagname == 'a' and self.link_levels >= 0: elif tagname == 'a' and self.link_levels >= 0:
@ -991,13 +999,17 @@ class HTMLConverter(object):
previous = self.current_block previous = self.current_block
self.process_children(tag, tag_css) self.process_children(tag, tag_css)
target = None target = None
if self.current_block == previous: if self.current_block == previous:
self.current_para.append_to(self.current_block) if self.current_para.has_text():
self.current_para = Paragraph() self.current_para.append_to(self.current_block)
if self.current_block.has_text(): self.current_para = Paragraph()
target = self.current_block target = self.current_block
else: else: # Empty <a> element
target = BlockSpace() self.current_page.append(self.current_block)
self.current_block = self.book.create_text_block(
textStyle=self.current_block.textStyle,
blockStyle=self.current_block.blockStyle)
target = self.book.create_text_block()
self.current_page.append(target) self.current_page.append(target)
else: else:
found = False found = False
@ -1330,6 +1342,11 @@ def process_file(path, options):
options.link_exclude = le options.link_exclude = le
options.page_break = pb options.page_break = pb
options.chapter_regex = re.compile(options.chapter_regex, re.IGNORECASE) options.chapter_regex = re.compile(options.chapter_regex, re.IGNORECASE)
fpba = options.force_page_break_attr.split(',')
if len(fpba) != 3:
fpba = ['$', '', '$']
options.force_page_break_attr = [re.compile(fpba[0], re.IGNORECASE), fpba[1],
re.compile(fpba[2], re.IGNORECASE)]
conv = HTMLConverter(book, fonts, path, options) conv = HTMLConverter(book, fonts, path, options)
conv.process_links() conv.process_links()
oname = options.output oname = options.output