mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-04 03:27:00 -05:00 
			
		
		
		
	Change unsmarten option to be an oeb transform and make it handle more cases and use a simpler implementation.
This commit is contained in:
		
							parent
							
								
									ec448064aa
								
							
						
					
					
						commit
						1ce4a97f63
					
				@ -1025,6 +1025,10 @@ OptionRecommendation(name='sr3_replace',
 | 
				
			|||||||
            from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
 | 
					            from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
 | 
				
			||||||
            LinearizeTables()(self.oeb, self.opts)
 | 
					            LinearizeTables()(self.oeb, self.opts)
 | 
				
			||||||
            
 | 
					            
 | 
				
			||||||
 | 
					        if self.opts.unsmarten_punctuation:
 | 
				
			||||||
 | 
					            from calibre.ebooks.oeb.transforms.unsmarten import UnsmartenPunctuation
 | 
				
			||||||
 | 
					            UnsmartenPunctuation()(self.oeb, self.opts)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        flattener = CSSFlattener(fbase=fbase, fkey=fkey,
 | 
					        flattener = CSSFlattener(fbase=fbase, fkey=fkey,
 | 
				
			||||||
                lineh=line_height,
 | 
					                lineh=line_height,
 | 
				
			||||||
                untable=self.output_plugin.file_type in ('mobi','lit'),
 | 
					                untable=self.output_plugin.file_type in ('mobi','lit'),
 | 
				
			||||||
 | 
				
			|||||||
@ -606,9 +606,6 @@ class HTMLPreProcessor(object):
 | 
				
			|||||||
        if getattr(self.extra_opts, 'smarten_punctuation', False):
 | 
					        if getattr(self.extra_opts, 'smarten_punctuation', False):
 | 
				
			||||||
            html = self.smarten_punctuation(html)
 | 
					            html = self.smarten_punctuation(html)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if getattr(self.extra_opts, 'unsmarten_punctuation', False):
 | 
					 | 
				
			||||||
            html = self.unsmarten_punctuation(html)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
 | 
					        unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
 | 
				
			||||||
        if unsupported_unicode_chars:
 | 
					        if unsupported_unicode_chars:
 | 
				
			||||||
            from calibre.utils.localization import get_udc
 | 
					            from calibre.utils.localization import get_udc
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										25
									
								
								src/calibre/ebooks/oeb/transforms/unsmarten.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								src/calibre/ebooks/oeb/transforms/unsmarten.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,25 @@
 | 
				
			|||||||
 | 
					# -*- coding: utf-8 -*-
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from __future__ import (unicode_literals, division, absolute_import, print_function)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					__license__ = 'GPL 3'
 | 
				
			||||||
 | 
					__copyright__ = '2011, John Schember <john@nachtimwald.com>'
 | 
				
			||||||
 | 
					__docformat__ = 'restructuredtext en'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from calibre.ebooks.oeb.base import OEB_DOCS, XPath, barename
 | 
				
			||||||
 | 
					from calibre.utils.unsmarten import unsmarten_text
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class UnsmartenPunctuation(object):
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    def unsmarten(self, root):
 | 
				
			||||||
 | 
					        for x in XPath('//h:*')(root):
 | 
				
			||||||
 | 
					            if not barename(x) == 'pre':
 | 
				
			||||||
 | 
					                if hasattr(x, 'text') and x.text:
 | 
				
			||||||
 | 
					                    x.text = unsmarten_text(x.text)
 | 
				
			||||||
 | 
					                if hasattr(x, 'tail') and x.tail:
 | 
				
			||||||
 | 
					                    x.tail = unsmarten_text(x.tail)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __call__(self, oeb, context):
 | 
				
			||||||
 | 
					        for x in oeb.manifest.items:
 | 
				
			||||||
 | 
					            if x.media_type in OEB_DOCS:
 | 
				
			||||||
 | 
					                self.unsmarten(x.data)
 | 
				
			||||||
@ -8,50 +8,6 @@ __docformat__ = 'restructuredtext en'
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from lxml import html as lhtml
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from calibre import prepare_string_for_xml
 | 
					 | 
				
			||||||
from calibre.ebooks.oeb.base import barename
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def unsmarten_html(html):
 | 
					 | 
				
			||||||
    def dump_text(elem):
 | 
					 | 
				
			||||||
        text = []
 | 
					 | 
				
			||||||
        tags = []
 | 
					 | 
				
			||||||
        tag = barename(elem.tag)
 | 
					 | 
				
			||||||
        attribs = elem.attrib
 | 
					 | 
				
			||||||
        tags.append(tag)
 | 
					 | 
				
			||||||
        # Turn the attributes into a string we can write with the tag.
 | 
					 | 
				
			||||||
        at = ''
 | 
					 | 
				
			||||||
        for k, v in attribs.items():
 | 
					 | 
				
			||||||
            at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
 | 
					 | 
				
			||||||
        # Write the tag.
 | 
					 | 
				
			||||||
        text.append('<%s%s>' % (tag, at))
 | 
					 | 
				
			||||||
        # Process tags that contain text.
 | 
					 | 
				
			||||||
        if hasattr(elem, 'text') and elem.text:
 | 
					 | 
				
			||||||
            # Don't modify text in pre tags.
 | 
					 | 
				
			||||||
            if tag == 'pre':
 | 
					 | 
				
			||||||
                text.append(elem.text)
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                text.append(prepare_string_for_xml(unsmarten_text(elem.text)))
 | 
					 | 
				
			||||||
        # Recurse down into tags within the tag we are in.
 | 
					 | 
				
			||||||
        for item in elem:
 | 
					 | 
				
			||||||
            text += dump_text(item)
 | 
					 | 
				
			||||||
        # Close all open tags.
 | 
					 | 
				
			||||||
        tags.reverse()
 | 
					 | 
				
			||||||
        for t in tags:
 | 
					 | 
				
			||||||
            text.append('</%s>' % t)
 | 
					 | 
				
			||||||
        # Add the text that is outside of the tag.
 | 
					 | 
				
			||||||
        if hasattr(elem, 'tail') and elem.tail:
 | 
					 | 
				
			||||||
            text.append(prepare_string_for_xml(unsmarten_text(elem.tail)))
 | 
					 | 
				
			||||||
        return text
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
    content = lhtml.fromstring(html)
 | 
					 | 
				
			||||||
    html = dump_text(content)
 | 
					 | 
				
			||||||
    html = ''.join(html)
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
    return html
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def unsmarten_text(txt):
 | 
					def unsmarten_text(txt):
 | 
				
			||||||
    txt = re.sub(u'–|–|–', r'--', txt) # en-dash
 | 
					    txt = re.sub(u'–|–|–', r'--', txt) # en-dash
 | 
				
			||||||
    txt = re.sub(u'—|—|—', r'---', txt) # em-dash
 | 
					    txt = re.sub(u'—|—|—', r'---', txt) # em-dash
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user