mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Refactor option handling and add support for --blank-after-para
This commit is contained in:
parent
b6075d3dd6
commit
5cd2492e54
@ -91,7 +91,7 @@ def option_parser(usage):
|
|||||||
profiles=['prs500']
|
profiles=['prs500']
|
||||||
parser.add_option('-o', '--output', action='store', default=None, \
|
parser.add_option('-o', '--output', action='store', default=None, \
|
||||||
help='Output file name. Default is derived from input filename')
|
help='Output file name. Default is derived from input filename')
|
||||||
parser.add_option('--ignore-tables', action='store_true', default=False,
|
parser.add_option('--ignore-tables', action='store_true', default=False, dest='ignore_tables',
|
||||||
help='Render HTML tables as blocks of text instead of actual tables. This is neccessary if the HTML contains very large or complex tables.')
|
help='Render HTML tables as blocks of text instead of actual tables. This is neccessary if the HTML contains very large or complex tables.')
|
||||||
laf = parser.add_option_group('LOOK AND FEEL')
|
laf = parser.add_option_group('LOOK AND FEEL')
|
||||||
laf.add_option('--cover', action='store', dest='cover', default=None, \
|
laf.add_option('--cover', action='store', dest='cover', default=None, \
|
||||||
@ -105,6 +105,8 @@ def option_parser(usage):
|
|||||||
help='Disable autorotation of images.', dest='disable_autorotation')
|
help='Disable autorotation of images.', dest='disable_autorotation')
|
||||||
laf.add_option('--wordspace', dest='wordspace', default=2.5, type='float',
|
laf.add_option('--wordspace', dest='wordspace', default=2.5, type='float',
|
||||||
help='Set the space between words in pts. Default is %default')
|
help='Set the space between words in pts. Default is %default')
|
||||||
|
laf.add_option('--blank-after-para', action='store_true', default=False,
|
||||||
|
dest='blank_after_para', help='Separate paragraphs by blank lines.')
|
||||||
page = parser.add_option_group('PAGE OPTIONS')
|
page = parser.add_option_group('PAGE OPTIONS')
|
||||||
page.add_option('-p', '--profile', default=PRS500_PROFILE, dest='profile', type='choice',
|
page.add_option('-p', '--profile', default=PRS500_PROFILE, dest='profile', type='choice',
|
||||||
choices=profiles, action='callback', callback=profile_from_string,
|
choices=profiles, action='callback', callback=profile_from_string,
|
||||||
|
@ -272,18 +272,23 @@ class HTMLConverter(object):
|
|||||||
|
|
||||||
processed_files = {} #: Files that have been processed
|
processed_files = {} #: Files that have been processed
|
||||||
|
|
||||||
def __init__(self, book, fonts, path,
|
def __hasattr__(self, attr):
|
||||||
font_delta=0, verbose=False, cover=None,
|
if hasattr(self.options, attr):
|
||||||
max_link_levels=sys.maxint, link_level=0,
|
return True
|
||||||
is_root=True, baen=False, chapter_detection=True,
|
return object.__hasattr__(self, attr)
|
||||||
chapter_regex=re.compile('chapter|book|appendix', re.IGNORECASE),
|
|
||||||
link_exclude=re.compile('$'),
|
def __getattr__(self, attr):
|
||||||
page_break=re.compile('h[12]', re.IGNORECASE),
|
if hasattr(self.options, attr):
|
||||||
force_page_break=re.compile('$', re.IGNORECASE),
|
return getattr(self.options, attr)
|
||||||
profile=PRS500_PROFILE,
|
return object.__getattr__(self, attr)
|
||||||
disable_autorotation=False,
|
|
||||||
ignore_tables=False,
|
def __setattr__(self, attr, val):
|
||||||
pdftohtml=False):
|
if hasattr(self.options, attr):
|
||||||
|
setattr(self.options, attr, val)
|
||||||
|
else:
|
||||||
|
object.__setattr__(self, attr, val)
|
||||||
|
|
||||||
|
def __init__(self, book, fonts, path, options, link_level=0, is_root=True):
|
||||||
'''
|
'''
|
||||||
Convert HTML file at C{path} and add it to C{book}. After creating
|
Convert HTML file at C{path} and add it to C{book}. After creating
|
||||||
the object, you must call L{self.process_links} on it to create the links and
|
the object, you must call L{self.process_links} on it to create the links and
|
||||||
@ -294,34 +299,9 @@ class HTMLConverter(object):
|
|||||||
@param fonts: dict specifying the font families to use
|
@param fonts: dict specifying the font families to use
|
||||||
@param path: path to the HTML file to process
|
@param path: path to the HTML file to process
|
||||||
@type path: C{str}
|
@type path: C{str}
|
||||||
@param width: Width of the device on which the LRF file is to be read
|
|
||||||
@type width: C{int}
|
|
||||||
@param height: Height of the device on which the LRF file is to be read
|
|
||||||
@type height: C{int}
|
|
||||||
@param font_delta: The amount in pts by which all fonts should be changed
|
|
||||||
@type font_delta: C{int}
|
|
||||||
@param verbose: Whether processing should be verbose or not
|
|
||||||
@type verbose: C{bool}
|
|
||||||
@param cover: Path to an image to use as the cover of this book
|
|
||||||
@type cover: C{str}
|
|
||||||
@param max_link_levels: Number of link levels to process recursively
|
|
||||||
@type max_link_levels: C{int}
|
|
||||||
@param link_level: Current link level
|
|
||||||
@type link_level: C{int}
|
|
||||||
@param is_root: True iff this object is converting the root HTML file
|
|
||||||
@type is_root: C{bool}
|
|
||||||
@param chapter_detection: Insert page breaks before what looks like
|
|
||||||
the start of a chapter
|
|
||||||
@type chapter_detection: C{bool}
|
|
||||||
@param chapter_regex: The compiled regular expression used to search for chapter titles
|
|
||||||
@param link_exclude: Compiled regex. Matching hrefs are ignored.
|
|
||||||
@param page_break: Compiled regex. Page breaks are inserted before matching
|
|
||||||
tags if no page-breaks are found and no chapter headings
|
|
||||||
are detected.
|
|
||||||
@param profile: Defines the geometry of the display device
|
|
||||||
@param disable_autorotation: Don't autorotate very wide images
|
|
||||||
'''
|
'''
|
||||||
# Defaults for various formatting tags
|
# Defaults for various formatting tags
|
||||||
|
object.__setattr__(self, 'options', options)
|
||||||
self.css = dict(
|
self.css = dict(
|
||||||
h1 = {"font-size" : "xx-large", "font-weight":"bold", 'text-indent':'0pt'},
|
h1 = {"font-size" : "xx-large", "font-weight":"bold", 'text-indent':'0pt'},
|
||||||
h2 = {"font-size" : "x-large", "font-weight":"bold", 'text-indent':'0pt'},
|
h2 = {"font-size" : "x-large", "font-weight":"bold", 'text-indent':'0pt'},
|
||||||
@ -341,22 +321,14 @@ class HTMLConverter(object):
|
|||||||
th = {'font-size' : 'large', 'font-weight':'bold'},
|
th = {'font-size' : 'large', 'font-weight':'bold'},
|
||||||
big = {'font-size' : 'large', 'font-weight':'bold'},
|
big = {'font-size' : 'large', 'font-weight':'bold'},
|
||||||
)
|
)
|
||||||
self.css['.libprs500_dropcaps'] = {'font-size': 'xx-large'}
|
self.css['.libprs500_dropcaps'] = {'font-size': 'xx-large'}
|
||||||
self.fonts = fonts #: dict specifying font families to use
|
self.fonts = fonts #: dict specifying font families to use
|
||||||
self.profile = profile #: Defines the geometry of the display device
|
|
||||||
self.chapter_detection = chapter_detection #: Flag to toggle chapter detection
|
|
||||||
self.chapter_regex = chapter_regex #: Regex used to search for chapter titles
|
|
||||||
self.link_exclude = link_exclude #: Ignore matching hrefs
|
|
||||||
self.scaled_images = {} #: Temporary files with scaled version of images
|
self.scaled_images = {} #: Temporary files with scaled version of images
|
||||||
self.rotated_images = {} #: Temporary files with rotated version of images
|
self.rotated_images = {} #: Temporary files with rotated version of images
|
||||||
self.max_link_levels = max_link_levels #: Number of link levels to process recursively
|
|
||||||
self.link_level = link_level #: Current link level
|
self.link_level = link_level #: Current link level
|
||||||
self.disable_autorotation = disable_autorotation
|
|
||||||
self.blockquote_style = book.create_block_style(sidemargin=60,
|
self.blockquote_style = book.create_block_style(sidemargin=60,
|
||||||
topskip=20, footskip=20)
|
topskip=20, footskip=20)
|
||||||
self.unindented_style = book.create_text_style(parindent=0)
|
self.unindented_style = book.create_text_style(parindent=0)
|
||||||
self.page_break = page_break #: Regex controlling page-break behavior
|
|
||||||
self.force_page_break = force_page_break #: Regex controlling forced page-break behavior
|
|
||||||
self.text_styles = []#: Keep track of already used textstyles
|
self.text_styles = []#: Keep track of already used textstyles
|
||||||
self.block_styles = []#: Keep track of already used blockstyles
|
self.block_styles = []#: Keep track of already used blockstyles
|
||||||
self.images = {} #: Images referenced in the HTML document
|
self.images = {} #: Images referenced in the HTML document
|
||||||
@ -364,12 +336,9 @@ class HTMLConverter(object):
|
|||||||
self.links = [] #: <a href=...> elements
|
self.links = [] #: <a href=...> elements
|
||||||
self.files = {} #: links that point to other files
|
self.files = {} #: links that point to other files
|
||||||
self.links_processed = False #: Whether process_links has been called on this object
|
self.links_processed = False #: Whether process_links has been called on this object
|
||||||
self.font_delta = font_delta
|
|
||||||
self.ignore_tables = ignore_tables
|
|
||||||
# Set by table processing code so that any <a name> within the table
|
# Set by table processing code so that any <a name> within the table
|
||||||
# point to the previous element
|
# point to the previous element
|
||||||
self.anchor_to_previous = None
|
self.anchor_to_previous = None
|
||||||
self.cover = cover
|
|
||||||
self.in_table = False
|
self.in_table = False
|
||||||
self.list_level = 0
|
self.list_level = 0
|
||||||
self.list_indent = 20
|
self.list_indent = 20
|
||||||
@ -386,21 +355,18 @@ class HTMLConverter(object):
|
|||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
|
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
|
||||||
nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
|
nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
|
||||||
self.baen = baen
|
if self.baen:
|
||||||
self.pdftohtml = pdftohtml
|
|
||||||
if baen:
|
|
||||||
nmassage.extend(HTMLConverter.BAEN_SANCTIFY)
|
nmassage.extend(HTMLConverter.BAEN_SANCTIFY)
|
||||||
|
|
||||||
raw = open(self.file_name, 'rb').read()
|
raw = open(self.file_name, 'rb').read()
|
||||||
if pdftohtml:
|
if self.pdftohtml:
|
||||||
nmassage.extend(HTMLConverter.PDFTOHTML)
|
nmassage.extend(HTMLConverter.PDFTOHTML)
|
||||||
raw = unicode(raw, 'utf8', 'replace')
|
raw = unicode(raw, 'utf8', 'replace')
|
||||||
self.soup = BeautifulSoup(raw,
|
self.soup = BeautifulSoup(raw,
|
||||||
convertEntities=BeautifulSoup.HTML_ENTITIES,
|
convertEntities=BeautifulSoup.HTML_ENTITIES,
|
||||||
markupMassage=nmassage)
|
markupMassage=nmassage)
|
||||||
print 'done\n\tConverting to BBeB...',
|
print 'done\n\tConverting to BBeB...',
|
||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
self.verbose = verbose
|
|
||||||
self.current_page = None
|
self.current_page = None
|
||||||
self.current_para = None
|
self.current_para = None
|
||||||
self.current_style = {}
|
self.current_style = {}
|
||||||
@ -618,7 +584,7 @@ class HTMLConverter(object):
|
|||||||
cb = CharButton(jb, text=text)
|
cb = CharButton(jb, text=text)
|
||||||
para.contents = []
|
para.contents = []
|
||||||
para.append(cb)
|
para.append(cb)
|
||||||
elif self.link_level < self.max_link_levels:
|
elif self.link_level < self.link_levels:
|
||||||
try: # os.access raises Exceptions if path has null bytes
|
try: # os.access raises Exceptions if path has null bytes
|
||||||
if not os.access(path.encode('utf8', 'replace'), os.R_OK):
|
if not os.access(path.encode('utf8', 'replace'), os.R_OK):
|
||||||
continue
|
continue
|
||||||
@ -630,20 +596,9 @@ class HTMLConverter(object):
|
|||||||
if not path in HTMLConverter.processed_files.keys():
|
if not path in HTMLConverter.processed_files.keys():
|
||||||
try:
|
try:
|
||||||
self.files[path] = HTMLConverter(
|
self.files[path] = HTMLConverter(
|
||||||
self.book, self.fonts, path,
|
self.book, self.fonts, path, self.options,
|
||||||
profile=self.profile,
|
link_level = self.link_level+1,
|
||||||
font_delta=self.font_delta, verbose=self.verbose,
|
is_root = False,)
|
||||||
link_level=self.link_level+1,
|
|
||||||
max_link_levels=self.max_link_levels,
|
|
||||||
is_root = False, baen=self.baen,
|
|
||||||
chapter_detection=self.chapter_detection,
|
|
||||||
chapter_regex=self.chapter_regex,
|
|
||||||
link_exclude=self.link_exclude,
|
|
||||||
page_break=self.page_break,
|
|
||||||
force_page_break=self.force_page_break,
|
|
||||||
disable_autorotation=self.disable_autorotation,
|
|
||||||
ignore_tables=self.ignore_tables,
|
|
||||||
pdftohtml=self.pdftohtml)
|
|
||||||
HTMLConverter.processed_files[path] = self.files[path]
|
HTMLConverter.processed_files[path] = self.files[path]
|
||||||
except Exception:
|
except Exception:
|
||||||
print >>sys.stderr, 'Unable to process', path
|
print >>sys.stderr, 'Unable to process', path
|
||||||
@ -969,7 +924,7 @@ class HTMLConverter(object):
|
|||||||
print 'Forcing page break at', tagname
|
print 'Forcing page break at', tagname
|
||||||
if tagname in ["title", "script", "meta", 'del', 'frameset']:
|
if tagname in ["title", "script", "meta", 'del', 'frameset']:
|
||||||
pass
|
pass
|
||||||
elif tagname == 'a' and self.max_link_levels >= 0:
|
elif tagname == 'a' and self.link_levels >= 0:
|
||||||
if tag.has_key('href') and not self.link_exclude.match(tag['href']):
|
if tag.has_key('href') and not self.link_exclude.match(tag['href']):
|
||||||
purl = urlparse(tag['href'])
|
purl = urlparse(tag['href'])
|
||||||
path = unquote(purl[2])
|
path = unquote(purl[2])
|
||||||
@ -1212,7 +1167,7 @@ class HTMLConverter(object):
|
|||||||
textStyle=ts)
|
textStyle=ts)
|
||||||
self.process_children(tag, tag_css)
|
self.process_children(tag, tag_css)
|
||||||
self.end_current_para()
|
self.end_current_para()
|
||||||
if tagname.startswith('h'):
|
if tagname.startswith('h') or self.blank_after_para:
|
||||||
self.current_block.append(CR())
|
self.current_block.append(CR())
|
||||||
if tag.has_key('id'):
|
if tag.has_key('id'):
|
||||||
self.targets[tag['id']] = self.current_block
|
self.targets[tag['id']] = self.current_block
|
||||||
@ -1325,16 +1280,12 @@ def process_file(path, options):
|
|||||||
re.compile('$')
|
re.compile('$')
|
||||||
fpb = re.compile(options.force_page_break, re.IGNORECASE) if options.force_page_break else \
|
fpb = re.compile(options.force_page_break, re.IGNORECASE) if options.force_page_break else \
|
||||||
re.compile('$')
|
re.compile('$')
|
||||||
conv = HTMLConverter(book, fonts, path, profile=options.profile,
|
options.cover = cpath
|
||||||
font_delta=options.font_delta,
|
options.force_page_break = fpb
|
||||||
cover=cpath, max_link_levels=options.link_levels,
|
options.link_exclude = le
|
||||||
verbose=options.verbose, baen=options.baen,
|
options.page_break = pb
|
||||||
chapter_detection=options.chapter_detection,
|
options.chapter_regex = re.compile(options.chapter_regex, re.IGNORECASE)
|
||||||
chapter_regex=re.compile(options.chapter_regex, re.IGNORECASE),
|
conv = HTMLConverter(book, fonts, path, options)
|
||||||
link_exclude=re.compile(le), page_break=pb, force_page_break=fpb,
|
|
||||||
disable_autorotation=options.disable_autorotation,
|
|
||||||
ignore_tables=options.ignore_tables,
|
|
||||||
pdftohtml=options.pdftohtml)
|
|
||||||
conv.process_links()
|
conv.process_links()
|
||||||
oname = options.output
|
oname = options.output
|
||||||
if not oname:
|
if not oname:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user