From b49617d5028408fe5428631b9e6a4fc877c69292 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 10 May 2007 18:15:58 +0000 Subject: [PATCH] Added automatic chapter detection. Prevent creation of redundant TextStyle and BlockStyle elements. --- src/libprs500/__init__.py | 2 +- src/libprs500/lrf/html/convert_from.py | 61 +++++++++++++++++++++++--- src/libprs500/lrf/pylrs/pylrs.py | 4 ++ 3 files changed, 59 insertions(+), 8 deletions(-) diff --git a/src/libprs500/__init__.py b/src/libprs500/__init__.py index 32078f9c18..604eadb1f5 100644 --- a/src/libprs500/__init__.py +++ b/src/libprs500/__init__.py @@ -33,7 +33,7 @@ You may have to adjust the GROUP and the location of the rules file to suit your distribution. """ -__version__ = "0.3.22" +__version__ = "0.3.23" __docformat__ = "epytext" __author__ = "Kovid Goyal " diff --git a/src/libprs500/lrf/html/convert_from.py b/src/libprs500/lrf/html/convert_from.py index 69fb85fb6e..74d1240c37 100644 --- a/src/libprs500/lrf/html/convert_from.py +++ b/src/libprs500/lrf/html/convert_from.py @@ -252,7 +252,8 @@ class HTMLConverter(object): def __init__(self, book, path, dpi=166, width=575, height=747, font_delta=0, verbose=False, cover=None, max_link_levels=sys.maxint, link_level=0, - is_root=True, baen=False): + is_root=True, baen=False, chapter_detection=True, + chapter_regex=re.compile('chapter|book|appendix', re.IGNORECASE)): ''' Convert HTML file at C{path} and add it to C{book}. After creating the object, you must call L{self.process_links} on it to create the links and @@ -278,16 +279,24 @@ class HTMLConverter(object): @type link_level: C{int} @param is_root: True iff this object is converting the root HTML file @type is_root: C{bool} + @param chapter_detection: Insert page breaks before what looks like + the start of a chapter + @type chapter_detection: C{bool} + @param chapter_regex: The compiled regular expression used to search for chapter titles ''' self.page_width = width #: The width of the page self.page_height = height #: The height of the page self.dpi = dpi #: The DPI of the intended display device + self.chapter_detection = chapter_detection #: Flag to toggle chapter detection + self.chapter_regex = chapter_regex #: Regex used to search for chapter titles self.scaled_images = {} #: Temporary files with scaled version of images self.max_link_levels = max_link_levels #: Number of link levels to process recursively self.link_level = link_level #: Current link level self.blockquote_style = book.create_block_style(sidemargin=60, topskip=20, footskip=20) self.unindented_style = book.create_text_style(parindent=0) + self.text_styles = []#: Keep track of already used textstyles + self.block_styles = []#: Keep track of already used blockstyles self.images = {} #: Images referenced in the HTML document self.targets = {} #: elements self.links = [] #: elements @@ -500,7 +509,9 @@ class HTMLConverter(object): font_delta=self.font_delta, verbose=self.verbose, link_level=self.link_level+1, max_link_levels=self.max_link_levels, - is_root = False, baen=self.baen) + is_root = False, baen=self.baen, + chapter_detection=self.chapter_detection, + chapter_regex=self.chapter_regex) HTMLConverter.processed_files[path] = self.files[path] except Exception: print >>sys.stderr, 'Unable to process', path @@ -587,6 +598,11 @@ class HTMLConverter(object): self.current_block.append_to(self.current_page) ts = self.book.create_text_style(**self.current_block.textStyle.attrs) ts.attrs['align'] = align + try: + index = self.text_styles.index(ts) + ts = self.text_styles[index] + except ValueError: + self.text_styles.append(ts) self.current_block = self.book.create_text_block( blockStyle=self.current_block.blockStyle, textStyle=ts) @@ -851,9 +867,19 @@ class HTMLConverter(object): self.current_para = Paragraph() ts = self.book.create_text_style(**self.current_block.textStyle.attrs) ts.attrs['parindent'] = 0 + try: + index = self.text_styles.index(ts) + ts = self.text_styles[index] + except ValueError: + self.text_styles.append(ts) bs = self.book.create_block_style(**self.current_block.blockStyle.attrs) bs.attrs['sidemargin'], bs.attrs['topskip'], bs.attrs['footskip'] = \ 60, 20, 20 + try: + index = self.block_styles.index(bs) + bs = self.block_styles[index] + except ValueError: + self.block_styles.append(bs) self.current_block = self.book.create_text_block( blockStyle=bs, textStyle=ts) self.process_children(tag, tag_css) @@ -863,6 +889,12 @@ class HTMLConverter(object): self.current_block = self.book.create_text_block(textStyle=pb.textStyle, blockStyle=pb.blockStyle) elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']: + if self.chapter_detection and tagname.startswith('h'): + src = self.get_text(tag) + if self.chapter_regex.search(src): + if self.verbose: + print 'Detected chapter', src + self.end_page() self.end_current_para() self.lstrip_toggle = True if tag_css.has_key('text-indent'): @@ -875,6 +907,11 @@ class HTMLConverter(object): self.current_block.append_to(self.current_page) ts = self.book.create_text_style(**self.current_block.textStyle.attrs) ts.attrs['parindent'] = indent + try: + index = self.text_styles.index(ts) + ts = self.text_styles[index] + except ValueError: + self.text_styles.append(ts) self.current_block = self.book.create_text_block(blockStyle=self.current_block.blockStyle, textStyle=ts) self.process_children(tag, tag_css) @@ -953,7 +990,9 @@ def process_file(path, options): conv = HTMLConverter(book, path, dpi=options.dpi, font_delta=options.font_delta, cover=cpath, max_link_levels=options.link_levels, - baen=options.baen) + baen=options.baen, + chapter_detection=options.chapter_detection, + chapter_regex=re.compile(options.chapter_regex, re.IGNORECASE)) conv.process_links() oname = options.output if not oname: @@ -984,14 +1023,22 @@ def main(): dest='font_delta') parser.add_option('--link-levels', action='store', type='int', default=sys.maxint, \ dest='link_levels', - help='''The maximum number of levels to recursively process - links. A value of 0 means thats links are not followed. - A negative value means that tags are ignored.''') + help=r'''The maximum number of levels to recursively process ''' + '''links. A value of 0 means thats links are not followed. ''' + '''A negative value means that tags are ignored.''') parser.add_option('--baen', action='store_true', default=False, dest='baen', help='''Preprocess Baen HTML files to improve generated LRF.''') parser.add_option('--dpi', action='store', type='int', default=166, dest='dpi', help='''The DPI of the target device. Default is 166 for the - Sony PRS 500''') + Sony PRS 500''') + parser.add_option('--disable-chapter-detection', action='store_false', + default=True, dest='chapter_detection', + help='''Prevent html2lrf from automatically inserting page breaks''' + '''before what it thinks are chapters.''') + parser.add_option('--chapter-regex', dest='chapter_regex', + default='chapter|book|appendix', + help='''The regular expression used to detect chapter titles.''' + '''It is searched for in heading tags. Default is chapter|book|appendix''') options, args = parser.parse_args() if len(args) != 1: parser.print_help() diff --git a/src/libprs500/lrf/pylrs/pylrs.py b/src/libprs500/lrf/pylrs/pylrs.py index f35bc6752e..d456406c53 100644 --- a/src/libprs500/lrf/pylrs/pylrs.py +++ b/src/libprs500/lrf/pylrs/pylrs.py @@ -1096,6 +1096,10 @@ class LrsStyle(LrsObject, LrsAttributes, LrsContainer): obj.appendTagDict(self.attrs, self.__class__.__name__) lrfWriter.append(obj) + def __eq__(self, other): + if hasattr(other, 'attrs'): + return self.__class__ == other.__class__ and self.attrs == other.attrs + return False class TextStyle(LrsStyle): """