mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
any2txt converter
This commit is contained in:
parent
94b7149180
commit
36fd295ca1
219
src/calibre/ebooks/htmlsymbols.py
Normal file
219
src/calibre/ebooks/htmlsymbols.py
Normal file
@ -0,0 +1,219 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
'''
|
||||||
|
Mapping of non-ASCII symbols and their corresponding HTML entity number and name
|
||||||
|
'''
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
|
|
||||||
|
# http://www.w3schools.com/tags/ref_symbols.asp
|
||||||
|
# Maps each symbol to the HTML character references that stand for it:
# [numeric reference, named reference]. Consumers replace either reference
# found in a document with the key character.
HTML_SYMBOLS = {
    # Math Symbols
    u'∀' : ['&#8704;', '&forall;'], # for all
    u'∂' : ['&#8706;', '&part;'], # part
    u'∃' : ['&#8707;', '&exist;'], # exists
    u'∅' : ['&#8709;', '&empty;'], # empty
    u'∇' : ['&#8711;', '&nabla;'], # nabla
    u'∈' : ['&#8712;', '&isin;'], # isin
    u'∉' : ['&#8713;', '&notin;'], # notin
    u'∋' : ['&#8715;', '&ni;'], # ni
    u'∏' : ['&#8719;', '&prod;'], # prod
    u'∑' : ['&#8721;', '&sum;'], # sum
    u'−' : ['&#8722;', '&minus;'], # minus
    u'∗' : ['&#8727;', '&lowast;'], # lowast
    u'√' : ['&#8730;', '&radic;'], # square root
    u'∝' : ['&#8733;', '&prop;'], # proportional to
    u'∞' : ['&#8734;', '&infin;'], # infinity
    u'∠' : ['&#8736;', '&ang;'], # angle
    u'∧' : ['&#8743;', '&and;'], # and
    u'∨' : ['&#8744;', '&or;'], # or
    u'∩' : ['&#8745;', '&cap;'], # cap
    u'∪' : ['&#8746;', '&cup;'], # cup
    u'∫' : ['&#8747;', '&int;'], # integral
    u'∴' : ['&#8756;', '&there4;'], # therefore
    u'∼' : ['&#8764;', '&sim;'], # similar to
    u'≅' : ['&#8773;', '&cong;'], # approximately equal
    u'≈' : ['&#8776;', '&asymp;'], # almost equal
    u'≠' : ['&#8800;', '&ne;'], # not equal
    u'≡' : ['&#8801;', '&equiv;'], # equivalent
    u'≤' : ['&#8804;', '&le;'], # less or equal
    u'≥' : ['&#8805;', '&ge;'], # greater or equal
    u'⊂' : ['&#8834;', '&sub;'], # subset of
    u'⊃' : ['&#8835;', '&sup;'], # superset of
    u'⊄' : ['&#8836;', '&nsub;'], # not subset of
    u'⊆' : ['&#8838;', '&sube;'], # subset or equal
    u'⊇' : ['&#8839;', '&supe;'], # superset or equal
    u'⊕' : ['&#8853;', '&oplus;'], # circled plus
    u'⊗' : ['&#8855;', '&otimes;'], # circled times
    u'⊥' : ['&#8869;', '&perp;'], # perpendicular
    u'⋅' : ['&#8901;', '&sdot;'], # dot operator
    # Greek Letters
    u'Α' : ['&#913;', '&Alpha;'], # Alpha
    u'Β' : ['&#914;', '&Beta;'], # Beta
    u'Γ' : ['&#915;', '&Gamma;'], # Gamma
    u'Δ' : ['&#916;', '&Delta;'], # Delta
    u'Ε' : ['&#917;', '&Epsilon;'], # Epsilon
    u'Ζ' : ['&#918;', '&Zeta;'], # Zeta
    u'Η' : ['&#919;', '&Eta;'], # Eta
    u'Θ' : ['&#920;', '&Theta;'], # Theta
    u'Ι' : ['&#921;', '&Iota;'], # Iota
    u'Κ' : ['&#922;', '&Kappa;'], # Kappa
    u'Λ' : ['&#923;', '&Lambda;'], # Lambda
    u'Μ' : ['&#924;', '&Mu;'], # Mu
    u'Ν' : ['&#925;', '&Nu;'], # Nu
    u'Ξ' : ['&#926;', '&Xi;'], # Xi
    u'Ο' : ['&#927;', '&Omicron;'], # Omicron
    u'Π' : ['&#928;', '&Pi;'], # Pi
    u'Ρ' : ['&#929;', '&Rho;'], # Rho
    u'Σ' : ['&#931;', '&Sigma;'], # Sigma
    u'Τ' : ['&#932;', '&Tau;'], # Tau
    u'Υ' : ['&#933;', '&Upsilon;'], # Upsilon
    u'Φ' : ['&#934;', '&Phi;'], # Phi
    u'Χ' : ['&#935;', '&Chi;'], # Chi
    u'Ψ' : ['&#936;', '&Psi;'], # Psi
    u'ω' : ['&#969;', '&omega;'], # omega
    u'ϑ' : ['&#977;', '&thetasym;'], # theta symbol
    u'ϒ' : ['&#978;', '&upsih;'], # upsilon symbol
    u'ϖ' : ['&#982;', '&piv;'], # pi symbol
    # Other
    u'Œ' : ['&#338;', '&OElig;'], # capital ligature OE
    u'œ' : ['&#339;', '&oelig;'], # small ligature oe
    u'Š' : ['&#352;', '&Scaron;'], # capital S with caron
    u'š' : ['&#353;', '&scaron;'], # small S with caron
    u'Ÿ' : ['&#376;', '&Yuml;'], # capital Y with diaeresis
    u'ƒ' : ['&#402;', '&fnof;'], # f with hook
    u'ˆ' : ['&#710;', '&circ;'], # modifier letter circumflex accent
    u'˜' : ['&#732;', '&tilde;'], # small tilde
    u'–' : ['&#8211;', '&ndash;'], # en dash
    u'—' : ['&#8212;', '&mdash;'], # em dash
    u'‘' : ['&#8216;', '&lsquo;'], # left single quotation mark
    u'’' : ['&#8217;', '&rsquo;'], # right single quotation mark
    u'‚' : ['&#8218;', '&sbquo;'], # single low-9 quotation mark
    u'“' : ['&#8220;', '&ldquo;'], # left double quotation mark
    u'”' : ['&#8221;', '&rdquo;'], # right double quotation mark
    u'„' : ['&#8222;', '&bdquo;'], # double low-9 quotation mark
    u'†' : ['&#8224;', '&dagger;'], # dagger
    u'‡' : ['&#8225;', '&Dagger;'], # double dagger
    u'•' : ['&#8226;', '&bull;'], # bullet
    u'…' : ['&#8230;', '&hellip;'], # horizontal ellipsis
    u'‰' : ['&#8240;', '&permil;'], # per mille
    u'′' : ['&#8242;', '&prime;'], # minutes
    u'″' : ['&#8243;', '&Prime;'], # seconds
    u'‹' : ['&#8249;', '&lsaquo;'], # single left angle quotation
    u'›' : ['&#8250;', '&rsaquo;'], # single right angle quotation
    u'‾' : ['&#8254;', '&oline;'], # overline
    u'€' : ['&#8364;', '&euro;'], # euro
    u'™' : ['&#8482;', '&trade;'], # trademark
    u'←' : ['&#8592;', '&larr;'], # left arrow
    u'↑' : ['&#8593;', '&uarr;'], # up arrow
    u'→' : ['&#8594;', '&rarr;'], # right arrow
    u'↓' : ['&#8595;', '&darr;'], # down arrow
    u'↔' : ['&#8596;', '&harr;'], # left right arrow
    u'↵' : ['&#8629;', '&crarr;'], # carriage return arrow
    u'⌈' : ['&#8968;', '&lceil;'], # left ceiling
    u'⌉' : ['&#8969;', '&rceil;'], # right ceiling
    u'⌊' : ['&#8970;', '&lfloor;'], # left floor
    u'⌋' : ['&#8971;', '&rfloor;'], # right floor
    u'◊' : ['&#9674;', '&loz;'], # lozenge
    u'♠' : ['&#9824;', '&spades;'], # spade
    u'♣' : ['&#9827;', '&clubs;'], # club
    u'♥' : ['&#9829;', '&hearts;'], # heart
    u'♦' : ['&#9830;', '&diams;'], # diamond
    # Extra http://www.ascii.cl/htmlcodes.htm
    u'<' : ['&#60;', '&lt;'], # less than sign
    u'>' : ['&#62;', '&gt;'], # greater than sign
    u'¡' : ['&#161;', '&iexcl;'], # inverted exclamation mark
    u'¢' : ['&#162;', '&cent;'], # cent sign
    u'£' : ['&#163;', '&pound;'], # pound sign
    u'¤' : ['&#164;', '&curren;'], # currency sign
    u'¥' : ['&#165;', '&yen;'], # yen sign
    u'¦' : ['&#166;', '&brvbar;'], # broken vertical bar
    u'§' : ['&#167;', '&sect;'], # section sign
    u'¨' : ['&#168;', '&uml;'], # spacing diaeresis - umlaut
    u'©' : ['&#169;', '&copy;'], # copyright sign
    u'ª' : ['&#170;', '&ordf;'], # feminine ordinal indicator
    u'«' : ['&#171;', '&laquo;'], # left double angle quotes
    u'¬' : ['&#172;', '&not;'], # not sign
    u'®' : ['&#174;', '&reg;'], # registered trade mark sign
    u'¯' : ['&#175;', '&macr;'], # spacing macron - overline
    u'°' : ['&#176;', '&deg;'], # degree sign
    u'±' : ['&#177;', '&plusmn;'], # plus-or-minus sign
    u'²' : ['&#178;', '&sup2;'], # superscript two - squared
    u'³' : ['&#179;', '&sup3;'], # superscript three - cubed
    u'´' : ['&#180;', '&acute;'], # acute accent - spacing acute
    u'µ' : ['&#181;', '&micro;'], # micro sign
    u'¶' : ['&#182;', '&para;'], # pilcrow sign - paragraph sign
    u'·' : ['&#183;', '&middot;'], # middle dot - Georgian comma
    u'¸' : ['&#184;', '&cedil;'], # spacing cedilla
    u'¹' : ['&#185;', '&sup1;'], # superscript one
    u'º' : ['&#186;', '&ordm;'], # masculine ordinal indicator
    u'»' : ['&#187;', '&raquo;'], # right double angle quotes
    u'¼' : ['&#188;', '&frac14;'], # fraction one quarter
    u'½' : ['&#189;', '&frac12;'], # fraction one half
    u'¾' : ['&#190;', '&frac34;'], # fraction three quarters
    u'¿' : ['&#191;', '&iquest;'], # inverted question mark
    u'À' : ['&#192;', '&Agrave;'], # latin capital letter A with grave
    u'Á' : ['&#193;', '&Aacute;'], # latin capital letter A with acute
    u'Â' : ['&#194;', '&Acirc;'], # latin capital letter A with circumflex
    u'Ã' : ['&#195;', '&Atilde;'], # latin capital letter A with tilde
    u'Ä' : ['&#196;', '&Auml;'], # latin capital letter A with diaeresis
    u'Å' : ['&#197;', '&Aring;'], # latin capital letter A with ring above
    u'Æ' : ['&#198;', '&AElig;'], # latin capital letter AE
    u'Ç' : ['&#199;', '&Ccedil;'], # latin capital letter C with cedilla
    u'È' : ['&#200;', '&Egrave;'], # latin capital letter E with grave
    u'É' : ['&#201;', '&Eacute;'], # latin capital letter E with acute
    u'Ê' : ['&#202;', '&Ecirc;'], # latin capital letter E with circumflex
    u'Ë' : ['&#203;', '&Euml;'], # latin capital letter E with diaeresis
    u'Ì' : ['&#204;', '&Igrave;'], # latin capital letter I with grave
    u'Í' : ['&#205;', '&Iacute;'], # latin capital letter I with acute
    u'Î' : ['&#206;', '&Icirc;'], # latin capital letter I with circumflex
    u'Ï' : ['&#207;', '&Iuml;'], # latin capital letter I with diaeresis
    u'Ð' : ['&#208;', '&ETH;'], # latin capital letter ETH
    u'Ñ' : ['&#209;', '&Ntilde;'], # latin capital letter N with tilde
    u'Ò' : ['&#210;', '&Ograve;'], # latin capital letter O with grave
    u'Ó' : ['&#211;', '&Oacute;'], # latin capital letter O with acute
    u'Ô' : ['&#212;', '&Ocirc;'], # latin capital letter O with circumflex
    u'Õ' : ['&#213;', '&Otilde;'], # latin capital letter O with tilde
    u'Ö' : ['&#214;', '&Ouml;'], # latin capital letter O with diaeresis
    u'×' : ['&#215;', '&times;'], # multiplication sign
    u'Ø' : ['&#216;', '&Oslash;'], # latin capital letter O with slash
    u'Ù' : ['&#217;', '&Ugrave;'], # latin capital letter U with grave
    u'Ú' : ['&#218;', '&Uacute;'], # latin capital letter U with acute
    u'Û' : ['&#219;', '&Ucirc;'], # latin capital letter U with circumflex
    u'Ü' : ['&#220;', '&Uuml;'], # latin capital letter U with diaeresis
    u'Ý' : ['&#221;', '&Yacute;'], # latin capital letter Y with acute
    u'Þ' : ['&#222;', '&THORN;'], # latin capital letter THORN
    u'ß' : ['&#223;', '&szlig;'], # latin small letter sharp s - ess-zed
    u'à' : ['&#224;', '&agrave;'], # latin small letter a with grave
    u'á' : ['&#225;', '&aacute;'], # latin small letter a with acute
    u'â' : ['&#226;', '&acirc;'], # latin small letter a with circumflex
    u'ã' : ['&#227;', '&atilde;'], # latin small letter a with tilde
    u'ä' : ['&#228;', '&auml;'], # latin small letter a with diaeresis
    u'å' : ['&#229;', '&aring;'], # latin small letter a with ring above
    u'æ' : ['&#230;', '&aelig;'], # latin small letter ae
    u'ç' : ['&#231;', '&ccedil;'], # latin small letter c with cedilla
    u'è' : ['&#232;', '&egrave;'], # latin small letter e with grave
    u'é' : ['&#233;', '&eacute;'], # latin small letter e with acute
    u'ê' : ['&#234;', '&ecirc;'], # latin small letter e with circumflex
    u'ë' : ['&#235;', '&euml;'], # latin small letter e with diaeresis
    u'ì' : ['&#236;', '&igrave;'], # latin small letter i with grave
    u'í' : ['&#237;', '&iacute;'], # latin small letter i with acute
    u'î' : ['&#238;', '&icirc;'], # latin small letter i with circumflex
    u'ï' : ['&#239;', '&iuml;'], # latin small letter i with diaeresis
    u'ð' : ['&#240;', '&eth;'], # latin small letter eth
    u'ñ' : ['&#241;', '&ntilde;'], # latin small letter n with tilde
    u'ò' : ['&#242;', '&ograve;'], # latin small letter o with grave
    u'ó' : ['&#243;', '&oacute;'], # latin small letter o with acute
    u'ô' : ['&#244;', '&ocirc;'], # latin small letter o with circumflex
    u'õ' : ['&#245;', '&otilde;'], # latin small letter o with tilde
    u'ö' : ['&#246;', '&ouml;'], # latin small letter o with diaeresis
    u'÷' : ['&#247;', '&divide;'], # division sign
    u'ø' : ['&#248;', '&oslash;'], # latin small letter o with slash
    u'ù' : ['&#249;', '&ugrave;'], # latin small letter u with grave
    u'ú' : ['&#250;', '&uacute;'], # latin small letter u with acute
    u'û' : ['&#251;', '&ucirc;'], # latin small letter u with circumflex
    u'ü' : ['&#252;', '&uuml;'], # latin small letter u with diaeresis
    u'ý' : ['&#253;', '&yacute;'], # latin small letter y with acute
    u'þ' : ['&#254;', '&thorn;'], # latin small letter thorn
    u'ÿ' : ['&#255;', '&yuml;'], # latin small letter y with diaeresis
}
|
||||||
|
|
9
src/calibre/ebooks/txt/__init__.py
Normal file
9
src/calibre/ebooks/txt/__init__.py
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2008, John Schember john@nachtimwald.com'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
'''
|
||||||
|
Used for txt output
|
||||||
|
'''
|
||||||
|
|
74
src/calibre/ebooks/txt/from_any.py
Normal file
74
src/calibre/ebooks/txt/from_any.py
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
'''
|
||||||
|
Convert any ebook format to TXT.
|
||||||
|
'''
|
||||||
|
|
||||||
|
from __future__ import with_statement
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net ' \
|
||||||
|
'and Marshall T. Vandegrift <llasram@gmail.com>' \
|
||||||
|
'and John Schember <john@nachtimwald.com>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import sys, os, glob, logging
|
||||||
|
|
||||||
|
from calibre.ebooks.epub.from_any import any2epub, formats, USAGE
|
||||||
|
from calibre.ebooks.epub import config as common_config
|
||||||
|
from calibre.ptempfile import TemporaryDirectory
|
||||||
|
from calibre.ebooks.txt.writer import oeb2txt, config as txt_config
|
||||||
|
|
||||||
|
def config(defaults=None):
    '''
    Build the option set used when converting any format to TXT.

    Starts from the common EPUB conversion options (registered under the
    name 'txt'), removes the 'profile' option and a fixed list of option
    groups, then merges in the options declared by the TXT writer.

    :param defaults: Passed through unchanged to both underlying config
        factories.
    :return: The merged config object.
    '''
    conf = common_config(defaults=defaults, name='txt')
    conf.remove_opt('profile')

    # Option groups dropped from the any2txt option set.
    dropped_groups = (
        'metadata',
        'traversal',
        'structure detection',
        'toc',
        'page layout',
    )
    for group_name in dropped_groups:
        del conf.option_set.groups[group_name]

    conf.update(txt_config(defaults=defaults))
    return conf
|
||||||
|
|
||||||
|
def option_parser(usage=USAGE):
    '''
    Create the command line parser for any2txt.

    :param usage: Usage template with two ``%s`` slots; they are filled
        with the output format name ('TXT') and the value of formats().
    :return: The option parser produced by config().
    '''
    usage_text = usage % ('TXT', formats())
    return config().option_parser(usage=usage_text)
|
||||||
|
|
||||||
|
def any2txt(opts, path, notification=None):
    '''
    Convert the ebook at `path` to a TXT file.

    The input is first exploded into an OEB directory with any2epub()
    inside a temporary directory; the OPF found there is then handed to
    oeb2txt().

    :param opts: Parsed options. ``opts.output`` is set to an absolute
        path (derived from the input file name when it is None).
        ``opts.profile`` and ``opts.dont_split_on_page_breaks`` are
        overwritten. ``opts.base_font_size2`` is changed only for the
        duration of the any2epub() call.
    :param path: Path of the input file. It must have an extension.
    :param notification: Accepted for interface compatibility; not used
        by this function.
    :raises ValueError: If `path` has no extension, or if the intermediate
        conversion produced no OPF file.
    '''
    if not os.path.splitext(path)[1]:
        raise ValueError('Unknown file type: '+path)

    if opts.output is None:
        opts.output = os.path.splitext(os.path.basename(path))[0]+'.txt'

    opts.output = os.path.abspath(opts.output)
    orig_output = opts.output

    with TemporaryDirectory('_any2txt') as tdir:
        oebdir = os.path.join(tdir, 'oeb')
        os.mkdir(oebdir)

        orig_bfs = opts.base_font_size2
        opts.output = os.path.join(tdir, 'dummy.epub')
        opts.profile = 'None'
        opts.dont_split_on_page_breaks = True
        opts.base_font_size2 = 0
        try:
            any2epub(opts, path, create_epub=False, oeb_cover=False, extract_to=oebdir)
        finally:
            # Restore the caller's values even when the conversion fails, so
            # opts never keeps pointing into the deleted temporary directory.
            opts.base_font_size2 = orig_bfs
            opts.output = orig_output

        opfs = glob.glob(os.path.join(oebdir, '*.opf'))
        if not opfs:
            raise ValueError('No OPF file was produced for: '+path)

        logging.getLogger('html2epub').info(_('Creating TXT file from EPUB...'))
        oeb2txt(opts, opfs[0])
|
||||||
|
|
||||||
|
def main(args=sys.argv):
    '''
    Command line entry point for any2txt.

    :param args: Full argument vector including the program name; the
        input file is the first positional argument after it.
    :return: 1 when no input file was given, 0 after a conversion.
    '''
    parser = option_parser()
    opts, args = parser.parse_args(args)
    if len(args) < 2:
        parser.print_help()
        # Single-argument call form: prints the same text whether print is
        # a statement (Python 2) or a function (Python 3).
        print('No input file specified.')
        return 1
    any2txt(opts, args[1])
    return 0
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.exit(main())
|
237
src/calibre/ebooks/txt/writer.py
Normal file
237
src/calibre/ebooks/txt/writer.py
Normal file
@ -0,0 +1,237 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
'''
|
||||||
|
Write content to TXT.
|
||||||
|
'''
|
||||||
|
from __future__ import with_statement
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
|
|
||||||
|
import os, logging, re, sys
|
||||||
|
|
||||||
|
from BeautifulSoup import BeautifulSoup
|
||||||
|
|
||||||
|
from calibre import LoggingInterface
|
||||||
|
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
|
||||||
|
from calibre.ebooks.epub.iterator import SpineItem
|
||||||
|
from calibre.ebooks.metadata import authors_to_string
|
||||||
|
from calibre.ebooks.metadata.meta import metadata_from_formats
|
||||||
|
from calibre.ebooks.metadata.opf2 import OPF
|
||||||
|
from calibre.customize.ui import run_plugins_on_postprocess
|
||||||
|
from calibre.utils.config import Config, StringConfig
|
||||||
|
|
||||||
|
class TXTWriter(object):
    '''
    Convert the spine documents of an OEB book into a single TXT file.

    :param newline: Newline sequence used in the written file.
    '''

    # Placeholders substituted for heading tags by strip_html(). They are
    # expanded to runs of newlines at the very end of cleanup_text(), after
    # the excessive-newline collapsing has already run.
    HEADING_START_MARKER = '-vzxedxy-'
    HEADING_END_MARKER = '-vlgzxey-'

    def __init__(self, newline):
        self.newline = newline

    def dump(self, oebpath, path, metadata):
        '''
        Write the text of every spine item of the OPF at `oebpath` to `path`.

        :param oebpath: Path of the OPF file.
        :param path: Path of the TXT file to write (UTF-8 encoded).
        :param metadata: Object with `author` and `title` attributes. Each
            non-empty value is upper-cased and prepended to the output,
            title first, each followed by three newlines.
        '''
        opf = OPF(oebpath, os.path.dirname(oebpath))
        spine = [SpineItem(i.path) for i in opf.spine]

        chunks = []
        for item in spine:
            # Read bytes and decode explicitly; text mode could alter the
            # data before it is decoded.
            with open(item, 'rb') as itemf:
                content = itemf.read().decode(item.encoding)
            # Convert newlines to unix style \n for processing. These
            # will be changed to the specified type later in the process.
            content = self.unix_newlines(content)
            content = self.strip_html(content)
            content = self.replace_html_symbols(content)
            content = self.cleanup_text(content)
            content = self.specified_newlines(content)
            chunks.append(content)
        tmpout = u''.join(chunks)

        # Prepend metadata
        separator = self.newline * 3
        if metadata.author:
            tmpout = (u'%s%s' % (metadata.author.upper(), separator)) + tmpout
        if metadata.title:
            tmpout = (u'%s%s' % (metadata.title.upper(), separator)) + tmpout

        # Put two blank lines at end of file
        tmpout = self._pad_trailing_newlines(tmpout)

        # Replace any existing output file, but do not fail when there is
        # none yet.
        if os.path.exists(path):
            os.remove(path)
        with open(path, 'w+b') as out:
            out.write(tmpout.encode('utf-8'))

    def _pad_trailing_newlines(self, text, count=3):
        '''Return `text` extended so that it ends with at least `count` newlines.'''
        present = 0
        while present < count and text.endswith(self.newline * (present + 1)):
            present += 1
        return text + self.newline * (count - present)

    def strip_html(self, html):
        '''
        Return the text of every <body> element in `html` with tags removed.

        Block level closing tags are turned into blank lines and heading
        tags into markers that cleanup_text() expands later.
        '''
        return u''.join(self._markup_to_text(unicode(dom_tree))
                for dom_tree in BeautifulSoup(html).findAll('body'))

    def _markup_to_text(self, text):
        '''Reduce the markup string of one body element to text with newline hints.'''
        # Remove unnecessary tags together with their content. Non-greedy
        # and dot-matches-newline so multi-line blocks are removed and
        # several blocks on one line do not swallow the text between them.
        for tag in ('script', 'style'):
            text = re.sub(r'(?isu)<[ ]*%s\b[^>]*>.*?</[ ]*%s[ ]*>' % (tag, tag), '', text)
        text = re.sub(r'(?s)<!--.*?-->', '', text)

        # Headings usually indicate Chapters.
        # We are going to use a marker to insert the proper number of
        # newline characters at the end of cleanup_text because cleanup_text
        # removes excessive newlines (more than 2).
        for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            text = re.sub(r'(?imu)<[ ]*%s[ ]*.*?>' % tag, self.HEADING_START_MARKER, text)
            text = re.sub(r'(?imu)</[ ]*%s[ ]*>' % tag, self.HEADING_END_MARKER, text)

        # Separate content with space.
        for tag in ('td',):
            text = re.sub(r'(?imu)</[ ]*%s[ ]*>' % tag, ' ', text)

        # Separate content with empty line.
        for tag in ('p', 'div', 'pre', 'li', 'table', 'tr'):
            text = re.sub(r'(?imu)</[ ]*%s[ ]*>' % tag, '\n\n', text)

        for tag in ('hr', 'br'):
            text = re.sub(r'(?imu)<[ ]*%s[ ]*/*?>' % tag, '\n\n', text)

        # Remove any tags that do not need special processing, including
        # tags whose attributes are wrapped over several lines.
        return re.sub(r'<[^>]*>', '', text)

    def replace_html_symbols(self, content):
        '''Replace every entity listed in HTML_SYMBOLS with its character.'''
        for symbol, codes in HTML_SYMBOLS.items():
            for code in codes:
                content = content.replace(code, symbol)
        return content

    def cleanup_text(self, text):
        '''
        Normalise whitespace in `text`, which must use \\n newlines.

        Single newlines are joined into spaces, runs of spaces and of three
        or more newlines are collapsed, and the heading markers are
        expanded into their final runs of newlines.
        '''
        # Replace non-breaking spaces, including the two character sequence
        # left behind when a UTF-8 encoded one was decoded as Latin-1. A
        # lone u'\xc2' is a legitimate letter and is kept.
        text = text.replace(u'\xc2\xa0', ' ')
        text = text.replace(u'\xa0', ' ')

        # Replace tabs, vertical tabs and form feeds with single space.
        text = re.sub(r'[\t\v\f]+', ' ', text)

        # Single line paragraph: a lone newline between two characters is a
        # soft wrap, not a paragraph break.
        text = re.sub(r'(?<=.)\n(?=.)', ' ', text)

        # Remove multiple spaces.
        text = re.sub('[ ]+', ' ', text)
        text = re.sub('(?imu)^[ ]+', '', text)
        text = re.sub('(?imu)[ ]+$', '', text)

        # Remove excessive newlines.
        text = re.sub('\n[ ]+\n', '\n\n', text)
        text = re.sub('\n{3,}', '\n\n', text)

        # Replace markers with the proper characters.
        text = text.replace(self.HEADING_START_MARKER, '\n\n\n\n\n')
        text = text.replace(self.HEADING_END_MARKER, '\n\n\n')

        return text

    def unix_newlines(self, text):
        '''Return `text` with \\r\\n and \\r newlines converted to \\n.'''
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')

        return text

    def specified_newlines(self, text):
        '''Return `text` with every \\n replaced by the configured newline.'''
        if self.newline == '\n':
            return text

        return text.replace('\n', self.newline)
|
||||||
|
|
||||||
|
class TxtMetadata(object):
    '''
    Holder for the metadata fields used by the TXT output code.

    All fields start out as None.
    '''

    def __init__(self):
        # author, title and series, in that order, all unset.
        self.author = self.title = self.series = None
|
||||||
|
|
||||||
|
|
||||||
|
class TxtNewlines(object):
    '''
    Resolve a newline type name to the newline sequence it stands for.

    The resolved sequence is available as the `newline` attribute. Names
    are matched case-insensitively; an unknown name resolves to
    os.linesep.
    '''

    # Known newline type names and the character sequence each one selects.
    NEWLINE_TYPES = {
        'system'  : os.linesep,
        'unix'    : '\n',
        'old_mac' : '\r',
        'windows' : '\r\n'
    }

    def __init__(self, newline_type):
        wanted = newline_type.lower()
        if wanted in self.NEWLINE_TYPES:
            self.newline = self.NEWLINE_TYPES[wanted]
        else:
            self.newline = os.linesep
|
||||||
|
|
||||||
|
|
||||||
|
def config(defaults=None):
    '''
    Declare the options that control TXT output.

    :param defaults: When None a Config named 'txt' is created; otherwise
        a StringConfig is built from `defaults`.
    :return: The config object with a 'TXT' group holding the `newline`,
        `prepend_author` and `prepend_title` options.
    '''
    desc = _('Options to control the conversion to TXT')
    if defaults is None:
        c = Config('txt', desc)
    else:
        c = StringConfig(defaults, desc)

    txt = c.add_group('TXT', _('TXT options.'))

    # The list of newline types is interpolated AFTER the translation
    # lookup, so the string passed to _() is the constant template.
    txt('newline', ['--newline'], default='system',
        help=_('Type of newline to use. Options are %s. Default is \'system\'. '
            'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
            'For Mac OS X use \'unix\'. \'system\' will default to the newline '
            'type used by this OS.') % sorted(TxtNewlines.NEWLINE_TYPES.keys()))
    txt('prepend_author', ['--prepend-author'], default='true',
        help=_('Write the author to the beginning of the file. '
            'Default is \'true\'. Use \'false\' to disable.'))
    txt('prepend_title', ['--prepend-title'], default='true',
        help=_('Write the title to the beginning of the file. '
            'Default is \'true\'. Use \'false\' to disable.'))

    return c
|
||||||
|
|
||||||
|
def option_parser():
    '''
    Create the command line parser for the standalone OPF to TXT tool.

    The parser comes from config() and is extended with the -o/--output
    and -v/--verbose options.
    '''
    conf = config()
    usage_text = ''.join(('%prog ', _('[options]'), ' file.opf'))
    parser = conf.option_parser(usage=usage_text)

    output_help = _('Output file. Default is derived from input filename.')
    parser.add_option('-o', '--output', default=None, help=output_help)

    verbose_help = _('Useful for debugging.')
    parser.add_option('-v', '--verbose', default=0, action='count', help=verbose_help)

    return parser
|
||||||
|
|
||||||
|
def oeb2txt(opts, inpath):
    '''
    Convert the OEB book described by the OPF file `inpath` to TXT.

    :param opts: Parsed options. Reads `verbose`, `output`, `newline`,
        `prepend_author` and `prepend_title`, plus the optional `authors`
        and `title` overrides.
    :param inpath: Path of the OPF file.

    When ``opts.output`` is None the output file name is the base name of
    `inpath` with its extension replaced by '.txt'.
    '''
    logger = LoggingInterface(logging.getLogger('oeb2txt'))
    logger.setup_cli_handler(opts.verbose)

    outpath = opts.output
    if outpath is None:
        outpath = os.path.splitext(os.path.basename(inpath))[0] + '.txt'

    mi = metadata_from_formats([inpath])
    metadata = TxtMetadata()
    # `authors` and `title` are looked up defensively: the parser built by
    # this module's option_parser() does not add them as far as is visible
    # here, so they may be absent from opts.
    if opts.prepend_author.lower() == 'true':
        metadata.author = getattr(opts, 'authors', None) or authors_to_string(mi.authors)
    if opts.prepend_title.lower() == 'true':
        metadata.title = getattr(opts, 'title', None) or mi.title

    newline = TxtNewlines(opts.newline)

    writer = TXTWriter(newline.newline)
    writer.dump(inpath, outpath, metadata)
    run_plugins_on_postprocess(outpath, 'txt')
    logger.log_info(_('Output written to ') + outpath)
|
||||||
|
|
||||||
|
def main(argv=sys.argv):
    '''
    Command line entry point for the OPF to TXT tool.

    :param argv: Full argument vector; the program name in argv[0] is
        skipped before parsing.
    :return: 1 (after printing the help) unless exactly one positional
        argument was given, otherwise whatever oeb2txt() returns.
    '''
    parser = option_parser()
    opts, positional = parser.parse_args(argv[1:])
    if len(positional) == 1:
        return oeb2txt(opts, positional[0])
    parser.print_help()
    return 1
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.exit(main())
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user