Working recursive link follower. HTML parsing is now of release-worthy quality.

This commit is contained in:
Kovid Goyal 2007-04-20 03:06:30 +00:00
parent 22872ee668
commit e70abecc03
3 changed files with 253 additions and 102 deletions

View File

@ -17,8 +17,27 @@ This package contains logic to read and write LRF files. The LRF file format is
At the time of writing, this package only supports reading and writing LRF meta information. See L{meta}. At the time of writing, this package only supports reading and writing LRF meta information. See L{meta}.
""" """
from optparse import OptionParser
from libprs500.lrf.pylrs.pylrs import Book as _Book
__docformat__ = "epytext" __docformat__ = "epytext"
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>" __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
class ConversionError(Exception): class ConversionError(Exception):
pass pass
def option_parser(usage):
    """
    Create an option parser preloaded with the options shared by the
    LRF conversion command line tools.

    @param usage: Usage message displayed in the parser's help output
    @type usage: string
    @return: Parser accepting C{-t/--title}, C{-a/--author} (default
             C{'Unknown'}) and C{-o/--output} (default C{None})
    @rtype: C{optparse.OptionParser}
    """
    parser = OptionParser(usage=usage)
    parser.add_option("-t", "--title", action="store", type="string",
                      dest="title", help="Set the title")
    parser.add_option("-a", "--author", action="store", type="string",
                      dest="author", help="Set the author", default='Unknown')
    # No explicit dest here: optparse derives "output" from the long option.
    parser.add_option('-o', '--output', action='store', default=None,
                      help='Output file name. Default is derived from input filename')
    return parser
def Book(font_delta=0, **settings):
    """
    Create a pylrs Book that uses this package's default text and page
    styles.

    @param font_delta: Number of steps by which to change the default font
                       size. Each step adds 20 to the base C{fontsize} of
                       100; negative values make the font smaller.
    @type font_delta: integer
    @param settings: Further keyword arguments, passed unchanged to the
                     pylrs C{Book} constructor (e.g. title, author)
    @return: The newly created book
    """
    text_style = dict(fontsize=100 + font_delta * 20)
    page_style = dict(textwidth=570, textheight=747)
    return _Book(textstyledefault=text_style, pagestyledefault=page_style,
                 **settings)

View File

@ -14,26 +14,32 @@
## You should have received a copy of the GNU General Public License along ## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc., ## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
""" """
Code to convert HTML ebooks into LRF ebooks. Code to convert HTML ebooks into LRF ebooks.
""" """
import os, re, sys import os, re, sys
from htmlentitydefs import name2codepoint from htmlentitydefs import name2codepoint
from optparse import OptionParser
from urllib import urlopen from urllib import urlopen
from urlparse import urlparse
from libprs500.lrf.html.BeautifulSoup import BeautifulSoup, Comment, Tag, NavigableString from libprs500.lrf.html.BeautifulSoup import BeautifulSoup, Comment, Tag, NavigableString
from libprs500.lrf.pylrs.pylrs import Book, Page, Paragraph, TextBlock, \ from libprs500.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, TextBlock, \
CR, Italic, ImageStream, ImageBlock ImageBlock, JumpButton, CharButton, Page
from libprs500.lrf.pylrs.pylrs import Span as _Span from libprs500.lrf.pylrs.pylrs import Span as _Span
from libprs500.lrf import ConversionError from libprs500.lrf import ConversionError, option_parser, Book
def ImagePage():
    """
    Create a page without side or top margins whose text area is 600x800.
    Such a page is used to hold a single image block.

    @return: The newly created C{Page}
    """
    return Page(evensidemargin=0, oddsidemargin=0, topmargin=0,
                textwidth=600, textheight=800)
class Span(_Span): class Span(_Span):
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo', 'nbsp' ] replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo', 'nbsp' ]
patterns = [ re.compile('&'+i+';') for i in replaced_entities ] patterns = [ re.compile('&'+i+';') for i in replaced_entities ]
targets = [ unichr(name2codepoint[i]) for i in replaced_entities ] targets = [ unichr(name2codepoint[i]) for i in replaced_entities ]
rules = zip(patterns, targets) rules = zip(patterns, targets)
@staticmethod @staticmethod
def unit_convert(val, ref=80): def unit_convert(val, ref=80):
""" """
@ -69,7 +75,7 @@ class Span(_Span):
return result return result
@staticmethod @staticmethod
def translate_attrs(d): def translate_attrs(d, font_delta=0):
""" """
Receives a dictionary of html attributes and styles and returns Receives a dictionary of html attributes and styles and returns
approximate Xylog equivalents in a new dictionary approximate Xylog equivalents in a new dictionary
@ -108,6 +114,9 @@ class Span(_Span):
t["fontsize"] = "120" t["fontsize"] = "120"
else: else:
t["fontsize"] = "100" t["fontsize"] = "100"
fnsz = int(t['fontsize'])
fnsz += font_delta * 20
t['fontsize'] = str(fnsz)
elif key == "font-weight": elif key == "font-weight":
m = re.match ("\s*([0-9]+)", val) m = re.match ("\s*([0-9]+)", val)
if m is not None: if m is not None:
@ -155,25 +164,30 @@ class Span(_Span):
t[key] = d[key] t[key] = d[key]
return t return t
def __init__(self, ns, css): def __init__(self, ns, css, font_delta=0):
src = ns.string src = ns.string
src = re.sub('[\n\r]+', '', src) src = re.sub(r'\s{2,}', ' ', src) # Remove multiple spaces
for pat, repl in Span.rules: for pat, repl in Span.rules:
src = pat.sub(repl, src) src = pat.sub(repl, src)
if not src: if not src:
raise ConversionError('No point in adding an empty string') raise ConversionError('No point in adding an empty string to a Span')
if 'font-style' in css.keys(): if 'font-style' in css.keys():
fs = css.pop('font-style') fs = css.pop('font-style')
if fs.lower() == 'italic': if fs.lower() == 'italic':
src = Italic(src) src = Italic(src)
attrs = Span.translate_attrs(css) attrs = Span.translate_attrs(css, font_delta=font_delta)
_Span.__init__(self, text=src, **attrs) _Span.__init__(self, text=src, **attrs)
class HTMLConvertor(object): class HTMLConverter(object):
selector_pat = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") selector_pat = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
# Defaults for various formatting tags # Defaults for various formatting tags
class Link(object):
    """
    An <a href=...> element whose target has not been resolved yet.
    """

    def __init__(self, para, tag):
        """
        @param para: Container to which the button created for this link
                     is appended once the link has been resolved
        @param tag: The <a> tag; its C{href} attribute names the target
        """
        self.para = para  #: Where the resolved link is inserted
        self.tag = tag    #: The original <a href=...> tag
css = dict( css = dict(
h1 = {"font-size":"xx-large", "font-weight":"bold"}, h1 = {"font-size":"xx-large", "font-weight":"bold"},
h2 = {"font-size":"x-large", "font-weight":"bold"}, h2 = {"font-size":"x-large", "font-weight":"bold"},
@ -184,16 +198,35 @@ class HTMLConvertor(object):
strong = {"font-weight":"bold"}, strong = {"font-weight":"bold"},
i = {"font-style":"italic"}, i = {"font-style":"italic"},
em = {"font-style":"italic"}, em = {"font-style":"italic"},
small = {'font-size':'small'}
) )
processed_files = {} #: Files that have been processed
def __init__(self, book, soup, verbose=False): def __init__(self, book, path, font_delta=0, verbose=False):
self.book = book #: The Book object representing a BBeB book self.images = {} #: Images referenced in the HTML document
self.soup = soup #: Parsed HTML soup self.targets = {} #: <a name=...> elements
self.links = [] #: <a href=...> elements
self.files = {} #: links that point to other files
self.links_processed = False #: Whether links_processed has been called on this object
self.font_delta = font_delta
self.book = book #: The Book object representing a BBeB book
path = os.path.abspath(path)
os.chdir(os.path.dirname(path))
self.file_name = os.path.basename(path)
print "Processing", self.file_name
print '\tParsing HTML...',
sys.stdout.flush()
self.soup = BeautifulSoup(open(self.file_name, 'r').read(), \
convertEntities=BeautifulSoup.HTML_ENTITIES)
print 'done\n\tConverting to BBeB...',
sys.stdout.flush()
self.verbose = verbose self.verbose = verbose
self.current_page = None self.current_page = None
self.current_para = None self.current_para = None
self.current_style = {} self.current_style = {}
self.parse_file(self.soup.html) self.parse_file(self.soup.html)
HTMLConverter.processed_files[path] = self
print 'done'
def parse_css(self, style): def parse_css(self, style):
""" """
@ -203,7 +236,8 @@ class HTMLConvertor(object):
selector name and the value is a dictionary of properties selector name and the value is a dictionary of properties
""" """
sdict = dict() sdict = dict()
for sel in re.findall(HTMLConvertor.selector_pat, style): style = re.sub('/\*.*?\*/', '', style) # Remove /*...*/ comments
for sel in re.findall(HTMLConverter.selector_pat, style):
for key in sel[0].split(','): for key in sel[0].split(','):
key = key.strip().lower() key = key.strip().lower()
val = self.parse_style_properties(sel[1]) val = self.parse_style_properties(sel[1])
@ -261,6 +295,7 @@ class HTMLConvertor(object):
self.book.append(self.current_page) self.book.append(self.current_page)
self.current_page = Page() self.current_page = Page()
self.current_block = TextBlock() self.current_block = TextBlock()
self.top = self.current_block
self.current_para = Paragraph() self.current_para = Paragraph()
self.parse_tag(html, {}) self.parse_tag(html, {})
if self.current_para: if self.current_para:
@ -269,14 +304,90 @@ class HTMLConvertor(object):
self.current_page.append(self.current_block) self.current_page.append(self.current_block)
if self.current_page: if self.current_page:
self.book.append(self.current_page) self.book.append(self.current_page)
def get_text(self, tag):
    """
    Return the concatenated text of C{tag} and all of its descendants.

    The empty string is returned for a tag whose CSS C{display} property
    is C{none}. Comments contribute nothing to the result.

    @param tag: The tag whose text is wanted
    @return: The collected text, possibly empty
    @rtype: string
    """
    css = self.tag_css(tag)
    if css.get('display', '').lower() == 'none':
        return ''
    pieces = []
    for c in tag.contents:
        # Comment has to be tested before NavigableString: BeautifulSoup
        # derives Comment from NavigableString, so the opposite order would
        # treat comments as ordinary text.
        if isinstance(c, Comment):
            continue
        elif isinstance(c, NavigableString):
            pieces.append(str(c))
        elif isinstance(c, Tag):
            pieces.append(self.get_text(c))
    return ''.join(pieces)
def process_links(self):
cwd = os.getcwd()
for link in self.links:
purl = urlparse(link.tag['href'])
if purl[1]: # Not a local link
continue
path, fragment = purl[2], purl[5]
para, tag = link.para, link.tag
if not path or os.path.basename(path) == self.file_name:
if fragment in self.targets.keys():
tb = self.targets[fragment]
jb = JumpButton(tb)
self.book.append(jb)
cb = CharButton(jb, text=self.get_text(tag))
para.append(cb)
else:
if not os.access(path, os.R_OK):
if self.verbose:
print "Skipping", link
continue
path = os.path.abspath(path)
if not path in HTMLConverter.processed_files.keys():
try:
self.files[path] = HTMLConverter(self.book, path, \
font_delta=self.font_delta, verbose=self.verbose)
HTMLConverter.processed_files[path] = self.files[path]
except:
continue
finally:
os.chdir(cwd)
else:
self.files[path] = HTMLConverter.processed_files[path]
conv = self.files[path]
if fragment in conv.targets.keys():
tb = conv.targets[fragment]
else:
tb = conv.top
jb = JumpButton(tb)
self.book.append(jb)
cb = CharButton(jb, text=self.get_text(tag))
para.append(cb)
self.links_processed = True
for path in self.files.keys():
if self.files[path].links_processed:
continue
try:
os.chdir(os.path.dirname(path))
self.files[path].process_links()
finally:
os.chdir(cwd)
def end_page(self): def end_page(self):
self.current_block.append(self.current_para) """
self.current_para = Paragraph() End the current page, ensuring that any further content is displayed
self.current_page.append(self.current_block) on a new page.
self.current_block = TextBlock() """
self.book.append(self.current_page) if self.current_para.contents:
self.current_page = Page() self.current_block.append(self.current_para)
if self.current_block.contents:
self.current_page.append(self.current_block)
if self.current_page.contents:
self.book.append(self.current_page)
self.current_para = Paragraph()
self.current_block = TextBlock()
self.current_page = Page()
def parse_tag(self, tag, parent_css): def parse_tag(self, tag, parent_css):
@ -286,52 +397,86 @@ class HTMLConvertor(object):
test = key.lower() test = key.lower()
if test.startswith('margin') or 'indent' in test or \ if test.startswith('margin') or 'indent' in test or \
'padding' in test or 'border' in test or 'page-break' in test \ 'padding' in test or 'border' in test or 'page-break' in test \
or test in ['color', 'display', 'text-decoration', 'letter-spacing']: or test in ['color', 'display', 'text-decoration', \
'letter-spacing', 'text-autospace', 'text-transform']:
css.pop(key) css.pop(key)
return css return css
def add_text(tag, css): def add_text(tag, css):
try: try:
self.current_para.append(Span(tag, sanctify_css(css))) self.current_para.append(Span(tag, sanctify_css(css), \
font_delta=self.font_delta))
except ConversionError, err: except ConversionError, err:
if self.verbose: if self.verbose:
print >>sys.stderr, err print >>sys.stderr, err
def process_children(ptag, pcss):
""" Process the children of ptag """
def process_text_tag(tag, pcss): for c in ptag.contents:
if 'page-break-before' in pcss.keys(): if isinstance(c, Comment):
if pcss['page-break-before'].lower() != 'avoid': continue
self.end_page() elif isinstance(c, Tag):
pcss.pop('page-break-before')
end_page = False
if 'page-break-after' in pcss.keys():
end_page = True
pcss.pop('page-break-after')
for c in tag.contents:
if isinstance(tag, NavigableString):
add_text(tag, pcss)
else:
self.parse_tag(c, pcss) self.parse_tag(c, pcss)
elif isinstance(c, NavigableString):
add_text(c, pcss)
def process_text_tag(tag, tag_css):
if 'page-break-before' in tag_css.keys():
if tag_css['page-break-before'].lower() != 'avoid':
self.end_page()
tag_css.pop('page-break-before')
end_page = False
if 'page-break-after' in tag_css.keys():
end_page = True
tag_css.pop('page-break-after')
process_children(tag, tag_css)
if end_page: if end_page:
self.end_page() self.end_page()
def add_image_block(path):
if os.access(path, os.R_OK):
self.end_page()
page = ImagePage()
if not self.images.has_key(path):
self.images[path] = ImageBlock(ImageStream(path))
page.append(self.images[path])
self.book.append(page)
try: try:
tagname = tag.name.lower() tagname = tag.name.lower()
except AttributeError: except AttributeError:
add_text(tag, parent_css) add_text(tag, parent_css)
return return
if tagname in ["title", "script", "meta"]: tag_css = self.tag_css(tag, parent_css=parent_css)
try: # Skip element if its display attribute is set to none
if tag_css['display'].lower() == 'none':
return
except KeyError:
pass pass
if tagname in ["title", "script", "meta", 'del']:
pass
elif tagname == 'a':
if tag.has_key('name'):
self.current_block.append(self.current_para)
self.current_page.append(self.current_block)
tb = TextBlock()
self.current_block = tb
self.current_para = Paragraph()
self.targets[tag['name']] = tb
process_children(tag, tag_css)
elif tag.has_key('href'):
purl = urlparse(tag['href'])
path = purl[2]
if path and os.path.splitext(path)[1][1:].lower() in \
['png', 'jpg', 'bmp', 'jpeg']:
add_image_block(path)
else:
span = _Span()
self.current_para.append(span)
self.links.append(HTMLConverter.Link(span, tag))
elif tagname == 'img': elif tagname == 'img':
if tag.has_key('src'): if tag.has_key('src'):
if os.access(tag['src'], os.R_OK): add_image_block(tag['src'])
self.current_block.append(self.current_para)
self.current_page.append(self.current_block)
ib = ImageBlock(ImageStream(tag['src']))
self.current_page.append(ib)
self.current_block = TextBlock()
self.current_para = Paragraph()
elif tagname in ['style', 'link']: elif tagname in ['style', 'link']:
if tagname == 'style': if tagname == 'style':
for c in tag.contents: for c in tag.contents:
@ -344,11 +489,11 @@ class HTMLConvertor(object):
f = urlopen(url) f = urlopen(url)
else: else:
f = open(url) f = open(url)
self.css.update(f.read()) self.parse_css(f.read())
f.close() f.close()
elif tagname == 'p': elif tagname in ['p', 'div', 'ul', 'ol', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
css = self.tag_css(tag, parent_css=parent_css) # TODO: Implement ol
indent = css.pop('text-indent', '') indent = tag_css.pop('text-indent', '')
if indent: if indent:
# TODO: If indent is different from current textblock's parindent # TODO: If indent is different from current textblock's parindent
# start a new TextBlock # start a new TextBlock
@ -356,10 +501,9 @@ class HTMLConvertor(object):
self.current_para.CR() # Put a paragraph end self.current_para.CR() # Put a paragraph end
self.current_block.append(self.current_para) self.current_block.append(self.current_para)
self.current_para = Paragraph() self.current_para = Paragraph()
process_text_tag(tag, css) process_text_tag(tag, tag_css)
elif tagname in ['b', 'strong', 'i', 'em', 'span']: elif tagname in ['b', 'strong', 'i', 'em', 'span']:
css = self.tag_css(tag, parent_css=parent_css) process_text_tag(tag, tag_css)
process_text_tag(tag, css)
elif tagname == 'font': elif tagname == 'font':
pass pass
elif tagname == 'link': elif tagname == 'link':
@ -369,55 +513,44 @@ class HTMLConvertor(object):
elif tagname == 'br': elif tagname == 'br':
self.current_para.append(CR()) self.current_para.append(CR())
elif tagname == 'hr': elif tagname == 'hr':
self.current_page.append(self.current_para) self.current_para.append(CR())
self.current_block.append(self.current_page) # TODO: Horizontal line?
self.current_para = Paragraph() else:
self.current_page = Page() process_children(tag, tag_css)
else:
css = self.tag_css(tag, parent_css=parent_css)
for c in tag.contents:
if isinstance(c, Comment):
continue
elif isinstance(c, Tag):
self.parse_tag(c, css)
elif isinstance(c, NavigableString):
add_text(c, css)
def writeto(self, path): def writeto(self, path, lrs=False):
if path.lower().endswith('lrs'): self.book.renderLrs(path) if lrs else self.book.renderLrf(path)
self.book.renderLrs(path)
else:
self.book.renderLrf(path)
def process_file(path, options): def process_file(path, options):
cwd = os.getcwd() cwd = os.getcwd()
try: try:
path = os.path.abspath(path) path = os.path.abspath(path)
os.chdir(os.path.dirname(path)) book = Book(font_delta=options.font_delta, title=options.title, \
soup = BeautifulSoup(open(path, 'r').read(), \ author=options.author, sourceencoding='utf8',\
convertEntities=BeautifulSoup.HTML_ENTITIES) )
book = Book(title=options.title, author=options.author, \ conv = HTMLConverter(book, path, font_delta=options.font_delta)
sourceencoding='utf8') conv.process_links()
conv = HTMLConvertor(book, soup) oname = options.output
name = os.path.splitext(os.path.basename(path))[0]+'.lrf' if not oname:
os.chdir(cwd) suffix = '.lrs' if options.lrs else '.lrf'
conv.writeto(name) name = os.path.splitext(os.path.basename(path))[0] + suffix
oname = os.path.join(cwd,name)
conv.writeto(oname, lrs=options.lrs)
print 'Output written to', oname
finally: finally:
os.chdir(cwd) os.chdir(cwd)
def main(): def main():
""" CLI for html -> lrf conversions """ """ CLI for html -> lrf conversions """
parser = OptionParser(usage=\ parser = option_parser("""usage: %prog [options] mybook.[html|rar|zip]
"""usage: %prog [options] mybook.txt
%prog converts mybook.html to mybook.lrf""")
%prog converts mybook.txt to mybook.lrf parser.add_option('--lrs', action='store_true', dest='lrs', \
"""\ help='Convert to LRS', default=False)
) parser.add_option('--font-delta', action='store', type='int', default=0, \
parser.add_option("-t", "--title", action="store", type="string", \ help="""Increase the font size by 2 * font-delta pts.
dest="title", help="Set the title") If font-delta is negative, the font size is decreased.""")
parser.add_option("-a", "--author", action="store", type="string", \
dest="author", help="Set the author", default='Unknown')
options, args = parser.parse_args() options, args = parser.parse_args()
if len(args) != 1: if len(args) != 1:
parser.print_help() parser.print_help()

View File

@ -16,22 +16,20 @@
Convert .txt files to .lrf Convert .txt files to .lrf
""" """
import os, sys import os, sys
from optparse import OptionParser
from libprs500.lrf import ConversionError from libprs500.lrf import ConversionError, option_parser
from libprs500.lrf import Book
def main(): def main():
""" CLI for txt -> lrf conversions """ """ CLI for txt -> lrf conversions """
parser = OptionParser(usage=\ parser = option_parser(\
"""usage: %prog [options] mybook.txt """usage: %prog [options] mybook.txt
%prog converts mybook.txt to mybook.lrf %prog converts mybook.txt to mybook.lrf
"""\ """\
) )
parser.add_option("-t", "--title", action="store", type="string", \
dest="title", help="Set the title")
parser.add_option("-a", "--author", action="store", type="string", \
dest="author", help="Set the author", default='Unknown')
defenc = 'cp1252' defenc = 'cp1252'
enchelp = 'Set the encoding used to decode ' + \ enchelp = 'Set the encoding used to decode ' + \
'the text in mybook.txt. Default encoding is ' + defenc 'the text in mybook.txt. Default encoding is ' + defenc
@ -59,7 +57,6 @@ def convert_txt(path, options):
the text in C{path}.) the text in C{path}.)
""" """
import fileinput import fileinput
from libprs500.lrf.pylrs.pylrs import Book
book = Book(title=options.title, author=options.author, \ book = Book(title=options.title, author=options.author, \
sourceencoding=options.encoding) sourceencoding=options.encoding)
buffer = '' buffer = ''
@ -72,14 +69,16 @@ def convert_txt(path, options):
block.Paragraph(buffer) block.Paragraph(buffer)
buffer = '' buffer = ''
basename = os.path.basename(path) basename = os.path.basename(path)
name = os.path.splitext(basename)[0]+'.lrf' oname = options.output
if not oname:
oname = os.path.splitext(basename)[0]+'.lrf'
try: try:
book.renderLrf(name) book.renderLrf(oname)
except UnicodeDecodeError: except UnicodeDecodeError:
raise ConversionError(path + ' is not encoded in ' + \ raise ConversionError(path + ' is not encoded in ' + \
options.encoding +'. Specify the '+ \ options.encoding +'. Specify the '+ \
'correct encoding with the -e option.') 'correct encoding with the -e option.')
return os.path.abspath(name) return os.path.abspath(oname)
if __name__ == '__main__': if __name__ == '__main__':