mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-31 14:33:54 -04:00
Working recursive link follower. HTML parsing is now of release-worthy quality.
This commit is contained in:
parent
22872ee668
commit
e70abecc03
@ -17,8 +17,27 @@ This package contains logic to read and write LRF files. The LRF file format is
|
|||||||
At the time of writing, this package only supports reading and writing LRF meta information. See L{meta}.
|
At the time of writing, this package only supports reading and writing LRF meta information. See L{meta}.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from optparse import OptionParser
|
||||||
|
|
||||||
|
from libprs500.lrf.pylrs.pylrs import Book as _Book
|
||||||
|
|
||||||
__docformat__ = "epytext"
|
__docformat__ = "epytext"
|
||||||
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
|
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
|
||||||
|
|
||||||
class ConversionError(Exception):
|
class ConversionError(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def option_parser(usage):
|
||||||
|
parser = OptionParser(usage=usage)
|
||||||
|
parser.add_option("-t", "--title", action="store", type="string", \
|
||||||
|
dest="title", help="Set the title")
|
||||||
|
parser.add_option("-a", "--author", action="store", type="string", \
|
||||||
|
dest="author", help="Set the author", default='Unknown')
|
||||||
|
parser.add_option('-o', '--output', action='store', default=None, \
|
||||||
|
help='Output file name. Default is derived from input filename')
|
||||||
|
return parser
|
||||||
|
|
||||||
|
def Book(font_delta=0, **settings):
|
||||||
|
return _Book(textstyledefault=dict(fontsize=100+font_delta*20), \
|
||||||
|
pagestyledefault=dict(textwidth=570, textheight=747), \
|
||||||
|
**settings)
|
@ -14,26 +14,32 @@
|
|||||||
## You should have received a copy of the GNU General Public License along
|
## You should have received a copy of the GNU General Public License along
|
||||||
## with this program; if not, write to the Free Software Foundation, Inc.,
|
## with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Code to convert HTML ebooks into LRF ebooks.
|
Code to convert HTML ebooks into LRF ebooks.
|
||||||
"""
|
"""
|
||||||
import os, re, sys
|
import os, re, sys
|
||||||
from htmlentitydefs import name2codepoint
|
from htmlentitydefs import name2codepoint
|
||||||
from optparse import OptionParser
|
|
||||||
from urllib import urlopen
|
from urllib import urlopen
|
||||||
|
from urlparse import urlparse
|
||||||
|
|
||||||
from libprs500.lrf.html.BeautifulSoup import BeautifulSoup, Comment, Tag, NavigableString
|
from libprs500.lrf.html.BeautifulSoup import BeautifulSoup, Comment, Tag, NavigableString
|
||||||
from libprs500.lrf.pylrs.pylrs import Book, Page, Paragraph, TextBlock, \
|
from libprs500.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, TextBlock, \
|
||||||
CR, Italic, ImageStream, ImageBlock
|
ImageBlock, JumpButton, CharButton, Page
|
||||||
from libprs500.lrf.pylrs.pylrs import Span as _Span
|
from libprs500.lrf.pylrs.pylrs import Span as _Span
|
||||||
from libprs500.lrf import ConversionError
|
from libprs500.lrf import ConversionError, option_parser, Book
|
||||||
|
|
||||||
|
def ImagePage():
|
||||||
|
return Page(evensidemargin=0, oddsidemargin=0, topmargin=0, \
|
||||||
|
textwidth=600, textheight=800)
|
||||||
|
|
||||||
class Span(_Span):
|
class Span(_Span):
|
||||||
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo', 'nbsp' ]
|
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo', 'nbsp' ]
|
||||||
patterns = [ re.compile('&'+i+';') for i in replaced_entities ]
|
patterns = [ re.compile('&'+i+';') for i in replaced_entities ]
|
||||||
targets = [ unichr(name2codepoint[i]) for i in replaced_entities ]
|
targets = [ unichr(name2codepoint[i]) for i in replaced_entities ]
|
||||||
rules = zip(patterns, targets)
|
rules = zip(patterns, targets)
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def unit_convert(val, ref=80):
|
def unit_convert(val, ref=80):
|
||||||
"""
|
"""
|
||||||
@ -69,7 +75,7 @@ class Span(_Span):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def translate_attrs(d):
|
def translate_attrs(d, font_delta=0):
|
||||||
"""
|
"""
|
||||||
Receives a dictionary of html attributes and styles and returns
|
Receives a dictionary of html attributes and styles and returns
|
||||||
approximate Xylog equivalents in a new dictionary
|
approximate Xylog equivalents in a new dictionary
|
||||||
@ -108,6 +114,9 @@ class Span(_Span):
|
|||||||
t["fontsize"] = "120"
|
t["fontsize"] = "120"
|
||||||
else:
|
else:
|
||||||
t["fontsize"] = "100"
|
t["fontsize"] = "100"
|
||||||
|
fnsz = int(t['fontsize'])
|
||||||
|
fnsz += font_delta * 20
|
||||||
|
t['fontsize'] = str(fnsz)
|
||||||
elif key == "font-weight":
|
elif key == "font-weight":
|
||||||
m = re.match ("\s*([0-9]+)", val)
|
m = re.match ("\s*([0-9]+)", val)
|
||||||
if m is not None:
|
if m is not None:
|
||||||
@ -155,25 +164,30 @@ class Span(_Span):
|
|||||||
t[key] = d[key]
|
t[key] = d[key]
|
||||||
return t
|
return t
|
||||||
|
|
||||||
def __init__(self, ns, css):
|
def __init__(self, ns, css, font_delta=0):
|
||||||
src = ns.string
|
src = ns.string
|
||||||
src = re.sub('[\n\r]+', '', src)
|
src = re.sub(r'\s{2,}', ' ', src) # Remove multiple spaces
|
||||||
for pat, repl in Span.rules:
|
for pat, repl in Span.rules:
|
||||||
src = pat.sub(repl, src)
|
src = pat.sub(repl, src)
|
||||||
if not src:
|
if not src:
|
||||||
raise ConversionError('No point in adding an empty string')
|
raise ConversionError('No point in adding an empty string to a Span')
|
||||||
if 'font-style' in css.keys():
|
if 'font-style' in css.keys():
|
||||||
fs = css.pop('font-style')
|
fs = css.pop('font-style')
|
||||||
if fs.lower() == 'italic':
|
if fs.lower() == 'italic':
|
||||||
src = Italic(src)
|
src = Italic(src)
|
||||||
attrs = Span.translate_attrs(css)
|
attrs = Span.translate_attrs(css, font_delta=font_delta)
|
||||||
_Span.__init__(self, text=src, **attrs)
|
_Span.__init__(self, text=src, **attrs)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class HTMLConvertor(object):
|
class HTMLConverter(object):
|
||||||
selector_pat = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
|
selector_pat = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
|
||||||
# Defaults for various formatting tags
|
# Defaults for various formatting tags
|
||||||
|
class Link(object):
|
||||||
|
def __init__(self, para, tag):
|
||||||
|
self.para = para
|
||||||
|
self.tag = tag
|
||||||
|
|
||||||
css = dict(
|
css = dict(
|
||||||
h1 = {"font-size":"xx-large", "font-weight":"bold"},
|
h1 = {"font-size":"xx-large", "font-weight":"bold"},
|
||||||
h2 = {"font-size":"x-large", "font-weight":"bold"},
|
h2 = {"font-size":"x-large", "font-weight":"bold"},
|
||||||
@ -184,16 +198,35 @@ class HTMLConvertor(object):
|
|||||||
strong = {"font-weight":"bold"},
|
strong = {"font-weight":"bold"},
|
||||||
i = {"font-style":"italic"},
|
i = {"font-style":"italic"},
|
||||||
em = {"font-style":"italic"},
|
em = {"font-style":"italic"},
|
||||||
|
small = {'font-size':'small'}
|
||||||
)
|
)
|
||||||
|
processed_files = {} #: Files that have been processed
|
||||||
|
|
||||||
def __init__(self, book, soup, verbose=False):
|
def __init__(self, book, path, font_delta=0, verbose=False):
|
||||||
self.book = book #: The Book object representing a BBeB book
|
self.images = {} #: Images referenced in the HTML document
|
||||||
self.soup = soup #: Parsed HTML soup
|
self.targets = {} #: <a name=...> elements
|
||||||
|
self.links = [] #: <a href=...> elements
|
||||||
|
self.files = {} #: links that point to other files
|
||||||
|
self.links_processed = False #: Whether process_links has been called on this object
|
||||||
|
self.font_delta = font_delta
|
||||||
|
self.book = book #: The Book object representing a BBeB book
|
||||||
|
path = os.path.abspath(path)
|
||||||
|
os.chdir(os.path.dirname(path))
|
||||||
|
self.file_name = os.path.basename(path)
|
||||||
|
print "Processing", self.file_name
|
||||||
|
print '\tParsing HTML...',
|
||||||
|
sys.stdout.flush()
|
||||||
|
self.soup = BeautifulSoup(open(self.file_name, 'r').read(), \
|
||||||
|
convertEntities=BeautifulSoup.HTML_ENTITIES)
|
||||||
|
print 'done\n\tConverting to BBeB...',
|
||||||
|
sys.stdout.flush()
|
||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
self.current_page = None
|
self.current_page = None
|
||||||
self.current_para = None
|
self.current_para = None
|
||||||
self.current_style = {}
|
self.current_style = {}
|
||||||
self.parse_file(self.soup.html)
|
self.parse_file(self.soup.html)
|
||||||
|
HTMLConverter.processed_files[path] = self
|
||||||
|
print 'done'
|
||||||
|
|
||||||
def parse_css(self, style):
|
def parse_css(self, style):
|
||||||
"""
|
"""
|
||||||
@ -203,7 +236,8 @@ class HTMLConvertor(object):
|
|||||||
selector name and the value is a dictionary of properties
|
selector name and the value is a dictionary of properties
|
||||||
"""
|
"""
|
||||||
sdict = dict()
|
sdict = dict()
|
||||||
for sel in re.findall(HTMLConvertor.selector_pat, style):
|
style = re.sub('/\*.*?\*/', '', style) # Remove /*...*/ comments
|
||||||
|
for sel in re.findall(HTMLConverter.selector_pat, style):
|
||||||
for key in sel[0].split(','):
|
for key in sel[0].split(','):
|
||||||
key = key.strip().lower()
|
key = key.strip().lower()
|
||||||
val = self.parse_style_properties(sel[1])
|
val = self.parse_style_properties(sel[1])
|
||||||
@ -261,6 +295,7 @@ class HTMLConvertor(object):
|
|||||||
self.book.append(self.current_page)
|
self.book.append(self.current_page)
|
||||||
self.current_page = Page()
|
self.current_page = Page()
|
||||||
self.current_block = TextBlock()
|
self.current_block = TextBlock()
|
||||||
|
self.top = self.current_block
|
||||||
self.current_para = Paragraph()
|
self.current_para = Paragraph()
|
||||||
self.parse_tag(html, {})
|
self.parse_tag(html, {})
|
||||||
if self.current_para:
|
if self.current_para:
|
||||||
@ -269,14 +304,90 @@ class HTMLConvertor(object):
|
|||||||
self.current_page.append(self.current_block)
|
self.current_page.append(self.current_block)
|
||||||
if self.current_page:
|
if self.current_page:
|
||||||
self.book.append(self.current_page)
|
self.book.append(self.current_page)
|
||||||
|
|
||||||
|
|
||||||
|
def get_text(self, tag):
|
||||||
|
css = self.tag_css(tag)
|
||||||
|
if css.has_key('display') and css['display'].lower() == 'none':
|
||||||
|
return ''
|
||||||
|
text = ''
|
||||||
|
for c in tag.contents:
|
||||||
|
if isinstance(c, NavigableString):
|
||||||
|
text += str(c)
|
||||||
|
elif isinstance(c, Comment):
|
||||||
|
return ''
|
||||||
|
elif isinstance(c, Tag):
|
||||||
|
text += self.get_text(c)
|
||||||
|
return text
|
||||||
|
|
||||||
|
def process_links(self):
|
||||||
|
cwd = os.getcwd()
|
||||||
|
for link in self.links:
|
||||||
|
purl = urlparse(link.tag['href'])
|
||||||
|
if purl[1]: # Not a local link
|
||||||
|
continue
|
||||||
|
path, fragment = purl[2], purl[5]
|
||||||
|
para, tag = link.para, link.tag
|
||||||
|
if not path or os.path.basename(path) == self.file_name:
|
||||||
|
if fragment in self.targets.keys():
|
||||||
|
tb = self.targets[fragment]
|
||||||
|
jb = JumpButton(tb)
|
||||||
|
self.book.append(jb)
|
||||||
|
cb = CharButton(jb, text=self.get_text(tag))
|
||||||
|
para.append(cb)
|
||||||
|
else:
|
||||||
|
if not os.access(path, os.R_OK):
|
||||||
|
if self.verbose:
|
||||||
|
print "Skipping", link
|
||||||
|
continue
|
||||||
|
path = os.path.abspath(path)
|
||||||
|
if not path in HTMLConverter.processed_files.keys():
|
||||||
|
try:
|
||||||
|
self.files[path] = HTMLConverter(self.book, path, \
|
||||||
|
font_delta=self.font_delta, verbose=self.verbose)
|
||||||
|
HTMLConverter.processed_files[path] = self.files[path]
|
||||||
|
except:
|
||||||
|
continue
|
||||||
|
finally:
|
||||||
|
os.chdir(cwd)
|
||||||
|
else:
|
||||||
|
self.files[path] = HTMLConverter.processed_files[path]
|
||||||
|
conv = self.files[path]
|
||||||
|
if fragment in conv.targets.keys():
|
||||||
|
tb = conv.targets[fragment]
|
||||||
|
else:
|
||||||
|
tb = conv.top
|
||||||
|
jb = JumpButton(tb)
|
||||||
|
self.book.append(jb)
|
||||||
|
cb = CharButton(jb, text=self.get_text(tag))
|
||||||
|
para.append(cb)
|
||||||
|
|
||||||
|
self.links_processed = True
|
||||||
|
|
||||||
|
for path in self.files.keys():
|
||||||
|
if self.files[path].links_processed:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
os.chdir(os.path.dirname(path))
|
||||||
|
self.files[path].process_links()
|
||||||
|
finally:
|
||||||
|
os.chdir(cwd)
|
||||||
|
|
||||||
|
|
||||||
def end_page(self):
|
def end_page(self):
|
||||||
self.current_block.append(self.current_para)
|
"""
|
||||||
self.current_para = Paragraph()
|
End the current page, ensuring that any further content is displayed
|
||||||
self.current_page.append(self.current_block)
|
on a new page.
|
||||||
self.current_block = TextBlock()
|
"""
|
||||||
self.book.append(self.current_page)
|
if self.current_para.contents:
|
||||||
self.current_page = Page()
|
self.current_block.append(self.current_para)
|
||||||
|
if self.current_block.contents:
|
||||||
|
self.current_page.append(self.current_block)
|
||||||
|
if self.current_page.contents:
|
||||||
|
self.book.append(self.current_page)
|
||||||
|
self.current_para = Paragraph()
|
||||||
|
self.current_block = TextBlock()
|
||||||
|
self.current_page = Page()
|
||||||
|
|
||||||
|
|
||||||
def parse_tag(self, tag, parent_css):
|
def parse_tag(self, tag, parent_css):
|
||||||
@ -286,52 +397,86 @@ class HTMLConvertor(object):
|
|||||||
test = key.lower()
|
test = key.lower()
|
||||||
if test.startswith('margin') or 'indent' in test or \
|
if test.startswith('margin') or 'indent' in test or \
|
||||||
'padding' in test or 'border' in test or 'page-break' in test \
|
'padding' in test or 'border' in test or 'page-break' in test \
|
||||||
or test in ['color', 'display', 'text-decoration', 'letter-spacing']:
|
or test in ['color', 'display', 'text-decoration', \
|
||||||
|
'letter-spacing', 'text-autospace', 'text-transform']:
|
||||||
css.pop(key)
|
css.pop(key)
|
||||||
return css
|
return css
|
||||||
|
|
||||||
def add_text(tag, css):
|
def add_text(tag, css):
|
||||||
try:
|
try:
|
||||||
self.current_para.append(Span(tag, sanctify_css(css)))
|
self.current_para.append(Span(tag, sanctify_css(css), \
|
||||||
|
font_delta=self.font_delta))
|
||||||
except ConversionError, err:
|
except ConversionError, err:
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print >>sys.stderr, err
|
print >>sys.stderr, err
|
||||||
|
|
||||||
|
def process_children(ptag, pcss):
|
||||||
|
""" Process the children of ptag """
|
||||||
def process_text_tag(tag, pcss):
|
for c in ptag.contents:
|
||||||
if 'page-break-before' in pcss.keys():
|
if isinstance(c, Comment):
|
||||||
if pcss['page-break-before'].lower() != 'avoid':
|
continue
|
||||||
self.end_page()
|
elif isinstance(c, Tag):
|
||||||
pcss.pop('page-break-before')
|
|
||||||
end_page = False
|
|
||||||
if 'page-break-after' in pcss.keys():
|
|
||||||
end_page = True
|
|
||||||
pcss.pop('page-break-after')
|
|
||||||
for c in tag.contents:
|
|
||||||
if isinstance(tag, NavigableString):
|
|
||||||
add_text(tag, pcss)
|
|
||||||
else:
|
|
||||||
self.parse_tag(c, pcss)
|
self.parse_tag(c, pcss)
|
||||||
|
elif isinstance(c, NavigableString):
|
||||||
|
add_text(c, pcss)
|
||||||
|
|
||||||
|
def process_text_tag(tag, tag_css):
|
||||||
|
if 'page-break-before' in tag_css.keys():
|
||||||
|
if tag_css['page-break-before'].lower() != 'avoid':
|
||||||
|
self.end_page()
|
||||||
|
tag_css.pop('page-break-before')
|
||||||
|
end_page = False
|
||||||
|
if 'page-break-after' in tag_css.keys():
|
||||||
|
end_page = True
|
||||||
|
tag_css.pop('page-break-after')
|
||||||
|
process_children(tag, tag_css)
|
||||||
if end_page:
|
if end_page:
|
||||||
self.end_page()
|
self.end_page()
|
||||||
|
|
||||||
|
def add_image_block(path):
|
||||||
|
if os.access(path, os.R_OK):
|
||||||
|
self.end_page()
|
||||||
|
page = ImagePage()
|
||||||
|
if not self.images.has_key(path):
|
||||||
|
self.images[path] = ImageBlock(ImageStream(path))
|
||||||
|
page.append(self.images[path])
|
||||||
|
self.book.append(page)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tagname = tag.name.lower()
|
tagname = tag.name.lower()
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
add_text(tag, parent_css)
|
add_text(tag, parent_css)
|
||||||
return
|
return
|
||||||
if tagname in ["title", "script", "meta"]:
|
tag_css = self.tag_css(tag, parent_css=parent_css)
|
||||||
|
try: # Skip element if its display attribute is set to none
|
||||||
|
if tag_css['display'].lower() == 'none':
|
||||||
|
return
|
||||||
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
if tagname in ["title", "script", "meta", 'del']:
|
||||||
|
pass
|
||||||
|
elif tagname == 'a':
|
||||||
|
if tag.has_key('name'):
|
||||||
|
self.current_block.append(self.current_para)
|
||||||
|
self.current_page.append(self.current_block)
|
||||||
|
tb = TextBlock()
|
||||||
|
self.current_block = tb
|
||||||
|
self.current_para = Paragraph()
|
||||||
|
self.targets[tag['name']] = tb
|
||||||
|
process_children(tag, tag_css)
|
||||||
|
elif tag.has_key('href'):
|
||||||
|
purl = urlparse(tag['href'])
|
||||||
|
path = purl[2]
|
||||||
|
if path and os.path.splitext(path)[1][1:].lower() in \
|
||||||
|
['png', 'jpg', 'bmp', 'jpeg']:
|
||||||
|
add_image_block(path)
|
||||||
|
else:
|
||||||
|
span = _Span()
|
||||||
|
self.current_para.append(span)
|
||||||
|
self.links.append(HTMLConverter.Link(span, tag))
|
||||||
elif tagname == 'img':
|
elif tagname == 'img':
|
||||||
if tag.has_key('src'):
|
if tag.has_key('src'):
|
||||||
if os.access(tag['src'], os.R_OK):
|
add_image_block(tag['src'])
|
||||||
self.current_block.append(self.current_para)
|
|
||||||
self.current_page.append(self.current_block)
|
|
||||||
ib = ImageBlock(ImageStream(tag['src']))
|
|
||||||
self.current_page.append(ib)
|
|
||||||
self.current_block = TextBlock()
|
|
||||||
self.current_para = Paragraph()
|
|
||||||
elif tagname in ['style', 'link']:
|
elif tagname in ['style', 'link']:
|
||||||
if tagname == 'style':
|
if tagname == 'style':
|
||||||
for c in tag.contents:
|
for c in tag.contents:
|
||||||
@ -344,11 +489,11 @@ class HTMLConvertor(object):
|
|||||||
f = urlopen(url)
|
f = urlopen(url)
|
||||||
else:
|
else:
|
||||||
f = open(url)
|
f = open(url)
|
||||||
self.css.update(f.read())
|
self.parse_css(f.read())
|
||||||
f.close()
|
f.close()
|
||||||
elif tagname == 'p':
|
elif tagname in ['p', 'div', 'ul', 'ol', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
||||||
css = self.tag_css(tag, parent_css=parent_css)
|
# TODO: Implement ol
|
||||||
indent = css.pop('text-indent', '')
|
indent = tag_css.pop('text-indent', '')
|
||||||
if indent:
|
if indent:
|
||||||
# TODO: If indent is different from current textblock's parindent
|
# TODO: If indent is different from current textblock's parindent
|
||||||
# start a new TextBlock
|
# start a new TextBlock
|
||||||
@ -356,10 +501,9 @@ class HTMLConvertor(object):
|
|||||||
self.current_para.CR() # Put a paragraph end
|
self.current_para.CR() # Put a paragraph end
|
||||||
self.current_block.append(self.current_para)
|
self.current_block.append(self.current_para)
|
||||||
self.current_para = Paragraph()
|
self.current_para = Paragraph()
|
||||||
process_text_tag(tag, css)
|
process_text_tag(tag, tag_css)
|
||||||
elif tagname in ['b', 'strong', 'i', 'em', 'span']:
|
elif tagname in ['b', 'strong', 'i', 'em', 'span']:
|
||||||
css = self.tag_css(tag, parent_css=parent_css)
|
process_text_tag(tag, tag_css)
|
||||||
process_text_tag(tag, css)
|
|
||||||
elif tagname == 'font':
|
elif tagname == 'font':
|
||||||
pass
|
pass
|
||||||
elif tagname == 'link':
|
elif tagname == 'link':
|
||||||
@ -369,55 +513,44 @@ class HTMLConvertor(object):
|
|||||||
elif tagname == 'br':
|
elif tagname == 'br':
|
||||||
self.current_para.append(CR())
|
self.current_para.append(CR())
|
||||||
elif tagname == 'hr':
|
elif tagname == 'hr':
|
||||||
self.current_page.append(self.current_para)
|
self.current_para.append(CR())
|
||||||
self.current_block.append(self.current_page)
|
# TODO: Horizontal line?
|
||||||
self.current_para = Paragraph()
|
else:
|
||||||
self.current_page = Page()
|
process_children(tag, tag_css)
|
||||||
else:
|
|
||||||
css = self.tag_css(tag, parent_css=parent_css)
|
|
||||||
for c in tag.contents:
|
|
||||||
if isinstance(c, Comment):
|
|
||||||
continue
|
|
||||||
elif isinstance(c, Tag):
|
|
||||||
self.parse_tag(c, css)
|
|
||||||
elif isinstance(c, NavigableString):
|
|
||||||
add_text(c, css)
|
|
||||||
|
|
||||||
def writeto(self, path):
|
def writeto(self, path, lrs=False):
|
||||||
if path.lower().endswith('lrs'):
|
self.book.renderLrs(path) if lrs else self.book.renderLrf(path)
|
||||||
self.book.renderLrs(path)
|
|
||||||
else:
|
|
||||||
self.book.renderLrf(path)
|
|
||||||
|
|
||||||
|
|
||||||
def process_file(path, options):
|
def process_file(path, options):
|
||||||
cwd = os.getcwd()
|
cwd = os.getcwd()
|
||||||
try:
|
try:
|
||||||
path = os.path.abspath(path)
|
path = os.path.abspath(path)
|
||||||
os.chdir(os.path.dirname(path))
|
book = Book(font_delta=options.font_delta, title=options.title, \
|
||||||
soup = BeautifulSoup(open(path, 'r').read(), \
|
author=options.author, sourceencoding='utf8',\
|
||||||
convertEntities=BeautifulSoup.HTML_ENTITIES)
|
)
|
||||||
book = Book(title=options.title, author=options.author, \
|
conv = HTMLConverter(book, path, font_delta=options.font_delta)
|
||||||
sourceencoding='utf8')
|
conv.process_links()
|
||||||
conv = HTMLConvertor(book, soup)
|
oname = options.output
|
||||||
name = os.path.splitext(os.path.basename(path))[0]+'.lrf'
|
if not oname:
|
||||||
os.chdir(cwd)
|
suffix = '.lrs' if options.lrs else '.lrf'
|
||||||
conv.writeto(name)
|
name = os.path.splitext(os.path.basename(path))[0] + suffix
|
||||||
|
oname = os.path.join(cwd,name)
|
||||||
|
conv.writeto(oname, lrs=options.lrs)
|
||||||
|
print 'Output written to', oname
|
||||||
finally:
|
finally:
|
||||||
os.chdir(cwd)
|
os.chdir(cwd)
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
""" CLI for html -> lrf conversions """
|
""" CLI for html -> lrf conversions """
|
||||||
parser = OptionParser(usage=\
|
parser = option_parser("""usage: %prog [options] mybook.[html|rar|zip]
|
||||||
"""usage: %prog [options] mybook.txt
|
|
||||||
|
%prog converts mybook.html to mybook.lrf""")
|
||||||
%prog converts mybook.txt to mybook.lrf
|
parser.add_option('--lrs', action='store_true', dest='lrs', \
|
||||||
"""\
|
help='Convert to LRS', default=False)
|
||||||
)
|
parser.add_option('--font-delta', action='store', type='int', default=0, \
|
||||||
parser.add_option("-t", "--title", action="store", type="string", \
|
help="""Increase the font size by 2 * font-delta pts.
|
||||||
dest="title", help="Set the title")
|
If font-delta is negative, the font size is decreased.""")
|
||||||
parser.add_option("-a", "--author", action="store", type="string", \
|
|
||||||
dest="author", help="Set the author", default='Unknown')
|
|
||||||
options, args = parser.parse_args()
|
options, args = parser.parse_args()
|
||||||
if len(args) != 1:
|
if len(args) != 1:
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
|
@ -16,22 +16,20 @@
|
|||||||
Convert .txt files to .lrf
|
Convert .txt files to .lrf
|
||||||
"""
|
"""
|
||||||
import os, sys
|
import os, sys
|
||||||
from optparse import OptionParser
|
|
||||||
|
|
||||||
from libprs500.lrf import ConversionError
|
from libprs500.lrf import ConversionError, option_parser
|
||||||
|
from libprs500.lrf import Book
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
""" CLI for txt -> lrf conversions """
|
""" CLI for txt -> lrf conversions """
|
||||||
parser = OptionParser(usage=\
|
parser = option_parser(\
|
||||||
"""usage: %prog [options] mybook.txt
|
"""usage: %prog [options] mybook.txt
|
||||||
|
|
||||||
%prog converts mybook.txt to mybook.lrf
|
%prog converts mybook.txt to mybook.lrf
|
||||||
"""\
|
"""\
|
||||||
)
|
)
|
||||||
parser.add_option("-t", "--title", action="store", type="string", \
|
|
||||||
dest="title", help="Set the title")
|
|
||||||
parser.add_option("-a", "--author", action="store", type="string", \
|
|
||||||
dest="author", help="Set the author", default='Unknown')
|
|
||||||
defenc = 'cp1252'
|
defenc = 'cp1252'
|
||||||
enchelp = 'Set the encoding used to decode ' + \
|
enchelp = 'Set the encoding used to decode ' + \
|
||||||
'the text in mybook.txt. Default encoding is ' + defenc
|
'the text in mybook.txt. Default encoding is ' + defenc
|
||||||
@ -59,7 +57,6 @@ def convert_txt(path, options):
|
|||||||
the text in C{path}.)
|
the text in C{path}.)
|
||||||
"""
|
"""
|
||||||
import fileinput
|
import fileinput
|
||||||
from libprs500.lrf.pylrs.pylrs import Book
|
|
||||||
book = Book(title=options.title, author=options.author, \
|
book = Book(title=options.title, author=options.author, \
|
||||||
sourceencoding=options.encoding)
|
sourceencoding=options.encoding)
|
||||||
buffer = ''
|
buffer = ''
|
||||||
@ -72,14 +69,16 @@ def convert_txt(path, options):
|
|||||||
block.Paragraph(buffer)
|
block.Paragraph(buffer)
|
||||||
buffer = ''
|
buffer = ''
|
||||||
basename = os.path.basename(path)
|
basename = os.path.basename(path)
|
||||||
name = os.path.splitext(basename)[0]+'.lrf'
|
oname = options.output
|
||||||
|
if not oname:
|
||||||
|
oname = os.path.splitext(basename)[0]+'.lrf'
|
||||||
try:
|
try:
|
||||||
book.renderLrf(name)
|
book.renderLrf(oname)
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
raise ConversionError(path + ' is not encoded in ' + \
|
raise ConversionError(path + ' is not encoded in ' + \
|
||||||
options.encoding +'. Specify the '+ \
|
options.encoding +'. Specify the '+ \
|
||||||
'correct encoding with the -e option.')
|
'correct encoding with the -e option.')
|
||||||
return os.path.abspath(name)
|
return os.path.abspath(oname)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
Loading…
x
Reference in New Issue
Block a user