mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-08-07 09:01:38 -04:00
Added automatic chapter detection. Prevent creation of redundant TextStyle and BlockStyle elements.
This commit is contained in:
parent
8b1800f8dc
commit
b49617d502
@ -33,7 +33,7 @@ You may have to adjust the GROUP and the location of the rules file to
|
|||||||
suit your distribution.
|
suit your distribution.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__version__ = "0.3.22"
|
__version__ = "0.3.23"
|
||||||
__docformat__ = "epytext"
|
__docformat__ = "epytext"
|
||||||
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
|
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
|
||||||
|
|
||||||
|
@ -252,7 +252,8 @@ class HTMLConverter(object):
|
|||||||
def __init__(self, book, path, dpi=166, width=575, height=747,
|
def __init__(self, book, path, dpi=166, width=575, height=747,
|
||||||
font_delta=0, verbose=False, cover=None,
|
font_delta=0, verbose=False, cover=None,
|
||||||
max_link_levels=sys.maxint, link_level=0,
|
max_link_levels=sys.maxint, link_level=0,
|
||||||
is_root=True, baen=False):
|
is_root=True, baen=False, chapter_detection=True,
|
||||||
|
chapter_regex=re.compile('chapter|book|appendix', re.IGNORECASE)):
|
||||||
'''
|
'''
|
||||||
Convert HTML file at C{path} and add it to C{book}. After creating
|
Convert HTML file at C{path} and add it to C{book}. After creating
|
||||||
the object, you must call L{self.process_links} on it to create the links and
|
the object, you must call L{self.process_links} on it to create the links and
|
||||||
@ -278,16 +279,24 @@ class HTMLConverter(object):
|
|||||||
@type link_level: C{int}
|
@type link_level: C{int}
|
||||||
@param is_root: True iff this object is converting the root HTML file
|
@param is_root: True iff this object is converting the root HTML file
|
||||||
@type is_root: C{bool}
|
@type is_root: C{bool}
|
||||||
|
@param chapter_detection: Insert page breaks before what looks like
|
||||||
|
the start of a chapter
|
||||||
|
@type chapter_detection: C{bool}
|
||||||
|
@param chapter_regex: The compiled regular expression used to search for chapter titles
|
||||||
'''
|
'''
|
||||||
self.page_width = width #: The width of the page
|
self.page_width = width #: The width of the page
|
||||||
self.page_height = height #: The height of the page
|
self.page_height = height #: The height of the page
|
||||||
self.dpi = dpi #: The DPI of the intended display device
|
self.dpi = dpi #: The DPI of the intended display device
|
||||||
|
self.chapter_detection = chapter_detection #: Flag to toggle chapter detection
|
||||||
|
self.chapter_regex = chapter_regex #: Regex used to search for chapter titles
|
||||||
self.scaled_images = {} #: Temporary files with scaled version of images
|
self.scaled_images = {} #: Temporary files with scaled version of images
|
||||||
self.max_link_levels = max_link_levels #: Number of link levels to process recursively
|
self.max_link_levels = max_link_levels #: Number of link levels to process recursively
|
||||||
self.link_level = link_level #: Current link level
|
self.link_level = link_level #: Current link level
|
||||||
self.blockquote_style = book.create_block_style(sidemargin=60,
|
self.blockquote_style = book.create_block_style(sidemargin=60,
|
||||||
topskip=20, footskip=20)
|
topskip=20, footskip=20)
|
||||||
self.unindented_style = book.create_text_style(parindent=0)
|
self.unindented_style = book.create_text_style(parindent=0)
|
||||||
|
self.text_styles = []#: Keep track of already used textstyles
|
||||||
|
self.block_styles = []#: Keep track of already used blockstyles
|
||||||
self.images = {} #: Images referenced in the HTML document
|
self.images = {} #: Images referenced in the HTML document
|
||||||
self.targets = {} #: <a name=...> elements
|
self.targets = {} #: <a name=...> elements
|
||||||
self.links = [] #: <a href=...> elements
|
self.links = [] #: <a href=...> elements
|
||||||
@ -500,7 +509,9 @@ class HTMLConverter(object):
|
|||||||
font_delta=self.font_delta, verbose=self.verbose,
|
font_delta=self.font_delta, verbose=self.verbose,
|
||||||
link_level=self.link_level+1,
|
link_level=self.link_level+1,
|
||||||
max_link_levels=self.max_link_levels,
|
max_link_levels=self.max_link_levels,
|
||||||
is_root = False, baen=self.baen)
|
is_root = False, baen=self.baen,
|
||||||
|
chapter_detection=self.chapter_detection,
|
||||||
|
chapter_regex=self.chapter_regex)
|
||||||
HTMLConverter.processed_files[path] = self.files[path]
|
HTMLConverter.processed_files[path] = self.files[path]
|
||||||
except Exception:
|
except Exception:
|
||||||
print >>sys.stderr, 'Unable to process', path
|
print >>sys.stderr, 'Unable to process', path
|
||||||
@ -587,6 +598,11 @@ class HTMLConverter(object):
|
|||||||
self.current_block.append_to(self.current_page)
|
self.current_block.append_to(self.current_page)
|
||||||
ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
|
ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
|
||||||
ts.attrs['align'] = align
|
ts.attrs['align'] = align
|
||||||
|
try:
|
||||||
|
index = self.text_styles.index(ts)
|
||||||
|
ts = self.text_styles[index]
|
||||||
|
except ValueError:
|
||||||
|
self.text_styles.append(ts)
|
||||||
self.current_block = self.book.create_text_block(
|
self.current_block = self.book.create_text_block(
|
||||||
blockStyle=self.current_block.blockStyle,
|
blockStyle=self.current_block.blockStyle,
|
||||||
textStyle=ts)
|
textStyle=ts)
|
||||||
@ -851,9 +867,19 @@ class HTMLConverter(object):
|
|||||||
self.current_para = Paragraph()
|
self.current_para = Paragraph()
|
||||||
ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
|
ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
|
||||||
ts.attrs['parindent'] = 0
|
ts.attrs['parindent'] = 0
|
||||||
|
try:
|
||||||
|
index = self.text_styles.index(ts)
|
||||||
|
ts = self.text_styles[index]
|
||||||
|
except ValueError:
|
||||||
|
self.text_styles.append(ts)
|
||||||
bs = self.book.create_block_style(**self.current_block.blockStyle.attrs)
|
bs = self.book.create_block_style(**self.current_block.blockStyle.attrs)
|
||||||
bs.attrs['sidemargin'], bs.attrs['topskip'], bs.attrs['footskip'] = \
|
bs.attrs['sidemargin'], bs.attrs['topskip'], bs.attrs['footskip'] = \
|
||||||
60, 20, 20
|
60, 20, 20
|
||||||
|
try:
|
||||||
|
index = self.block_styles.index(bs)
|
||||||
|
bs = self.block_styles[index]
|
||||||
|
except ValueError:
|
||||||
|
self.block_styles.append(bs)
|
||||||
self.current_block = self.book.create_text_block(
|
self.current_block = self.book.create_text_block(
|
||||||
blockStyle=bs, textStyle=ts)
|
blockStyle=bs, textStyle=ts)
|
||||||
self.process_children(tag, tag_css)
|
self.process_children(tag, tag_css)
|
||||||
@ -863,6 +889,12 @@ class HTMLConverter(object):
|
|||||||
self.current_block = self.book.create_text_block(textStyle=pb.textStyle,
|
self.current_block = self.book.create_text_block(textStyle=pb.textStyle,
|
||||||
blockStyle=pb.blockStyle)
|
blockStyle=pb.blockStyle)
|
||||||
elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
||||||
|
if self.chapter_detection and tagname.startswith('h'):
|
||||||
|
src = self.get_text(tag)
|
||||||
|
if self.chapter_regex.search(src):
|
||||||
|
if self.verbose:
|
||||||
|
print 'Detected chapter', src
|
||||||
|
self.end_page()
|
||||||
self.end_current_para()
|
self.end_current_para()
|
||||||
self.lstrip_toggle = True
|
self.lstrip_toggle = True
|
||||||
if tag_css.has_key('text-indent'):
|
if tag_css.has_key('text-indent'):
|
||||||
@ -875,6 +907,11 @@ class HTMLConverter(object):
|
|||||||
self.current_block.append_to(self.current_page)
|
self.current_block.append_to(self.current_page)
|
||||||
ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
|
ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
|
||||||
ts.attrs['parindent'] = indent
|
ts.attrs['parindent'] = indent
|
||||||
|
try:
|
||||||
|
index = self.text_styles.index(ts)
|
||||||
|
ts = self.text_styles[index]
|
||||||
|
except ValueError:
|
||||||
|
self.text_styles.append(ts)
|
||||||
self.current_block = self.book.create_text_block(blockStyle=self.current_block.blockStyle,
|
self.current_block = self.book.create_text_block(blockStyle=self.current_block.blockStyle,
|
||||||
textStyle=ts)
|
textStyle=ts)
|
||||||
self.process_children(tag, tag_css)
|
self.process_children(tag, tag_css)
|
||||||
@ -953,7 +990,9 @@ def process_file(path, options):
|
|||||||
conv = HTMLConverter(book, path, dpi=options.dpi,
|
conv = HTMLConverter(book, path, dpi=options.dpi,
|
||||||
font_delta=options.font_delta,
|
font_delta=options.font_delta,
|
||||||
cover=cpath, max_link_levels=options.link_levels,
|
cover=cpath, max_link_levels=options.link_levels,
|
||||||
baen=options.baen)
|
baen=options.baen,
|
||||||
|
chapter_detection=options.chapter_detection,
|
||||||
|
chapter_regex=re.compile(options.chapter_regex, re.IGNORECASE))
|
||||||
conv.process_links()
|
conv.process_links()
|
||||||
oname = options.output
|
oname = options.output
|
||||||
if not oname:
|
if not oname:
|
||||||
@ -984,14 +1023,22 @@ def main():
|
|||||||
dest='font_delta')
|
dest='font_delta')
|
||||||
parser.add_option('--link-levels', action='store', type='int', default=sys.maxint, \
|
parser.add_option('--link-levels', action='store', type='int', default=sys.maxint, \
|
||||||
dest='link_levels',
|
dest='link_levels',
|
||||||
help='''The maximum number of levels to recursively process
|
help=r'''The maximum number of levels to recursively process '''
|
||||||
links. A value of 0 means thats links are not followed.
|
'''links. A value of 0 means thats links are not followed. '''
|
||||||
A negative value means that <a> tags are ignored.''')
|
'''A negative value means that <a> tags are ignored.''')
|
||||||
parser.add_option('--baen', action='store_true', default=False, dest='baen',
|
parser.add_option('--baen', action='store_true', default=False, dest='baen',
|
||||||
help='''Preprocess Baen HTML files to improve generated LRF.''')
|
help='''Preprocess Baen HTML files to improve generated LRF.''')
|
||||||
parser.add_option('--dpi', action='store', type='int', default=166, dest='dpi',
|
parser.add_option('--dpi', action='store', type='int', default=166, dest='dpi',
|
||||||
help='''The DPI of the target device. Default is 166 for the
|
help='''The DPI of the target device. Default is 166 for the
|
||||||
Sony PRS 500''')
|
Sony PRS 500''')
|
||||||
|
parser.add_option('--disable-chapter-detection', action='store_false',
|
||||||
|
default=True, dest='chapter_detection',
|
||||||
|
help='''Prevent html2lrf from automatically inserting page breaks'''
|
||||||
|
'''before what it thinks are chapters.''')
|
||||||
|
parser.add_option('--chapter-regex', dest='chapter_regex',
|
||||||
|
default='chapter|book|appendix',
|
||||||
|
help='''The regular expression used to detect chapter titles.'''
|
||||||
|
'''It is searched for in heading tags. Default is chapter|book|appendix''')
|
||||||
options, args = parser.parse_args()
|
options, args = parser.parse_args()
|
||||||
if len(args) != 1:
|
if len(args) != 1:
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
|
@ -1096,6 +1096,10 @@ class LrsStyle(LrsObject, LrsAttributes, LrsContainer):
|
|||||||
obj.appendTagDict(self.attrs, self.__class__.__name__)
|
obj.appendTagDict(self.attrs, self.__class__.__name__)
|
||||||
lrfWriter.append(obj)
|
lrfWriter.append(obj)
|
||||||
|
|
||||||
|
def __eq__(self, other):
    '''
    Compare two style objects by value rather than by identity.

    Two styles are equal when they are instances of exactly the same
    class and their C{attrs} mappings compare equal. Any object without
    an C{attrs} attribute is never equal to a style.

    @param other: The object to compare against
    @return: True iff C{other} is an equivalent style
    @rtype: C{bool}
    '''
    if not hasattr(other, 'attrs'):
        return False
    return self.__class__ == other.__class__ and self.attrs == other.attrs

def __ne__(self, other):
    '''
    Logical negation of L{__eq__}.

    Python 2 does not derive C{!=} from C{__eq__}; without this method
    two styles that compare equal with C{==} would also compare unequal
    with C{!=}.

    @param other: The object to compare against
    @return: True iff C{other} is not an equivalent style
    @rtype: C{bool}
    '''
    return not self.__eq__(other)
|
||||||
|
|
||||||
class TextStyle(LrsStyle):
|
class TextStyle(LrsStyle):
|
||||||
"""
|
"""
|
||||||
|
Loading…
x
Reference in New Issue
Block a user