Support CSS color attribute and support reading title, author from html0 files.

This commit is contained in:
Kovid Goyal 2007-10-06 05:41:57 +00:00
parent 6bed1e2372
commit 16d1518d19
3 changed files with 165 additions and 10 deletions

View File

@ -0,0 +1,125 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import re
## Lookup table from lower-case CSS/HTML colour keywords to their '#RRGGBB'
## hex value. Keys must be lower-case with no surrounding whitespace because
## callers look names up after lower-casing the value found in the markup.
## The table covers the full CSS3 extended colour keyword set, including both
## the 'gray' and 'grey' spellings.
NAME_MAP = {
    u'aliceblue': u'#F0F8FF',
    u'antiquewhite': u'#FAEBD7',
    u'aqua': u'#00FFFF',
    u'aquamarine': u'#7FFFD4',
    u'azure': u'#F0FFFF',
    u'beige': u'#F5F5DC',
    u'bisque': u'#FFE4C4',
    u'black': u'#000000',
    u'blanchedalmond': u'#FFEBCD',
    u'blue': u'#0000FF',
    u'blueviolet': u'#8A2BE2',
    u'brown': u'#A52A2A',
    u'burlywood': u'#DEB887',
    u'cadetblue': u'#5F9EA0',
    u'chartreuse': u'#7FFF00',
    u'chocolate': u'#D2691E',
    u'coral': u'#FF7F50',
    u'cornflowerblue': u'#6495ED',
    u'cornsilk': u'#FFF8DC',
    u'crimson': u'#DC143C',
    u'cyan': u'#00FFFF',
    u'darkblue': u'#00008B',
    u'darkcyan': u'#008B8B',
    u'darkgoldenrod': u'#B8860B',
    u'darkgray': u'#A9A9A9',
    u'darkgreen': u'#006400',
    u'darkgrey': u'#A9A9A9',
    u'darkkhaki': u'#BDB76B',
    u'darkmagenta': u'#8B008B',
    u'darkolivegreen': u'#556B2F',
    u'darkorange': u'#FF8C00',
    u'darkorchid': u'#9932CC',
    u'darkred': u'#8B0000',
    u'darksalmon': u'#E9967A',
    u'darkseagreen': u'#8FBC8F',
    u'darkslateblue': u'#483D8B',
    u'darkslategray': u'#2F4F4F',
    u'darkslategrey': u'#2F4F4F',
    u'darkturquoise': u'#00CED1',
    u'darkviolet': u'#9400D3',
    u'deeppink': u'#FF1493',
    u'deepskyblue': u'#00BFFF',
    u'dimgray': u'#696969',
    u'dimgrey': u'#696969',
    u'dodgerblue': u'#1E90FF',
    u'firebrick': u'#B22222',
    u'floralwhite': u'#FFFAF0',
    u'forestgreen': u'#228B22',
    u'fuchsia': u'#FF00FF',
    u'gainsboro': u'#DCDCDC',
    u'ghostwhite': u'#F8F8FF',
    u'gold': u'#FFD700',
    u'goldenrod': u'#DAA520',
    u'gray': u'#808080',
    u'green': u'#008000',
    u'greenyellow': u'#ADFF2F',
    u'grey': u'#808080',
    u'honeydew': u'#F0FFF0',
    u'hotpink': u'#FF69B4',
    # The next two keys originally carried a trailing space and so could
    # never be matched by a normal lookup.
    u'indianred': u'#CD5C5C',
    u'indigo': u'#4B0082',
    u'ivory': u'#FFFFF0',
    u'khaki': u'#F0E68C',
    u'lavender': u'#E6E6FA',
    u'lavenderblush': u'#FFF0F5',
    u'lawngreen': u'#7CFC00',
    u'lemonchiffon': u'#FFFACD',
    u'lightblue': u'#ADD8E6',
    u'lightcoral': u'#F08080',
    u'lightcyan': u'#E0FFFF',
    u'lightgoldenrodyellow': u'#FAFAD2',
    u'lightgray': u'#D3D3D3',
    u'lightgreen': u'#90EE90',
    u'lightgrey': u'#D3D3D3',
    u'lightpink': u'#FFB6C1',
    u'lightsalmon': u'#FFA07A',
    u'lightseagreen': u'#20B2AA',
    u'lightskyblue': u'#87CEFA',
    u'lightslategray': u'#778899',
    u'lightslategrey': u'#778899',
    u'lightsteelblue': u'#B0C4DE',
    u'lightyellow': u'#FFFFE0',
    # Was '#87CEFA' (the lightskyblue value) in the original table.
    u'lime': u'#00FF00',
    u'limegreen': u'#32CD32',
    u'linen': u'#FAF0E6',
    u'magenta': u'#FF00FF',
    u'maroon': u'#800000',
    u'mediumaquamarine': u'#66CDAA',
    u'mediumblue': u'#0000CD',
    u'mediumorchid': u'#BA55D3',
    u'mediumpurple': u'#9370DB',
    u'mediumseagreen': u'#3CB371',
    u'mediumslateblue': u'#7B68EE',
    u'mediumspringgreen': u'#00FA9A',
    u'mediumturquoise': u'#48D1CC',
    u'mediumvioletred': u'#C71585',
    u'midnightblue': u'#191970',
    u'mintcream': u'#F5FFFA',
    u'mistyrose': u'#FFE4E1',
    u'moccasin': u'#FFE4B5',
    u'navajowhite': u'#FFDEAD',
    u'navy': u'#000080',
    u'oldlace': u'#FDF5E6',
    u'olive': u'#808000',
    u'olivedrab': u'#6B8E23',
    u'orange': u'#FFA500',
    u'orangered': u'#FF4500',
    u'orchid': u'#DA70D6',
    u'palegoldenrod': u'#EEE8AA',
    u'palegreen': u'#98FB98',
    u'paleturquoise': u'#AFEEEE',
    u'palevioletred': u'#DB7093',
    u'papayawhip': u'#FFEFD5',
    u'peachpuff': u'#FFDAB9',
    u'peru': u'#CD853F',
    u'pink': u'#FFC0CB',
    u'plum': u'#DDA0DD',
    u'powderblue': u'#B0E0E6',
    u'purple': u'#800080',
    u'red': u'#FF0000',
    u'rosybrown': u'#BC8F8F',
    u'royalblue': u'#4169E1',
    u'saddlebrown': u'#8B4513',
    u'salmon': u'#FA8072',
    # Was '#8B4513' (the saddlebrown value) in the original table.
    u'sandybrown': u'#F4A460',
    u'seagreen': u'#2E8B57',
    u'seashell': u'#FFF5EE',
    u'sienna': u'#A0522D',
    u'silver': u'#C0C0C0',
    u'skyblue': u'#87CEEB',
    u'slateblue': u'#6A5ACD',
    u'slategray': u'#708090',
    u'slategrey': u'#708090',
    u'snow': u'#FFFAFA',
    u'springgreen': u'#00FF7F',
    u'steelblue': u'#4682B4',
    u'tan': u'#D2B48C',
    u'teal': u'#008080',
    u'thistle': u'#D8BFD8',
    u'tomato': u'#FF6347',
    u'turquoise': u'#40E0D0',
    u'violet': u'#EE82EE',
    u'wheat': u'#F5DEB3',
    u'white': u'#FFFFFF',
    u'whitesmoke': u'#F5F5F5',
    u'yellow': u'#FFFF00',
    u'yellowgreen': u'#9ACD32',
}
# '#RRGGBB' with real hexadecimal digits (the original used \d and therefore
# rejected any colour containing a-f).
hex_pat = re.compile(r'#([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})', re.IGNORECASE)
# CSS shorthand '#RGB'; the lookahead keeps it from matching a prefix of a
# longer (malformed) run of hex digits.
_short_hex_pat = re.compile(r'#([0-9a-f])([0-9a-f])([0-9a-f])(?![0-9a-f])', re.IGNORECASE)
# 'rgb(r, g, b)' with decimal integer components.
rgb_pat = re.compile(r'rgb\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)', re.IGNORECASE)


def lrs_color(html_color):
    """
    Convert an HTML/CSS colour specification to a '0x00rrggbb' string.

    Recognised forms, tried in this order: '#rrggbb', '#rgb',
    'rgb(r, g, b)' with integer components, and the colour names listed in
    NAME_MAP. Matching is case insensitive and the hex digits in the result
    are always lower case.

    @param html_color: The colour value as found in the markup.
    @return: A string of the form '0x00rrggbb'. Unrecognised values yield
    '0x00000000' (black).
    """
    hcol = html_color.lower()
    match = hex_pat.search(hcol)
    if match:
        return '0x00' + ''.join(match.groups())
    match = _short_hex_pat.search(hcol)
    if match:
        # '#abc' is shorthand for '#aabbcc'
        return '0x00' + ''.join(digit * 2 for digit in match.groups())
    match = rgb_pat.search(hcol)
    if match:
        # Zero pad every component to two hex digits and clamp out of range
        # values to 255 so the result always has exactly eight hex digits.
        return '0x00' + ''.join('%02x' % min(int(comp), 255)
                                for comp in match.groups())
    named = NAME_MAP.get(hcol.strip())
    if named is not None:
        # NAME_MAP stores '#RRGGBB'; convert to the same form as above.
        return '0x00' + named.lstrip('#').lower()
    return '0x00000000'

View File

@ -45,6 +45,7 @@ from libprs500 import filename_to_utf8, setup_cli_handlers, __appname__
from libprs500.ptempfile import PersistentTemporaryFile from libprs500.ptempfile import PersistentTemporaryFile
from libprs500.ebooks.metadata.opf import OPFReader from libprs500.ebooks.metadata.opf import OPFReader
from libprs500.devices.interface import Device from libprs500.devices.interface import Device
from libprs500.ebooks.lrf.html.color_map import lrs_color
class HTMLConverter(object): class HTMLConverter(object):
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
@ -96,14 +97,20 @@ class HTMLConverter(object):
# Fix Book Designer markup # Fix Book Designer markup
BOOK_DESIGNER = [ BOOK_DESIGNER = [
# Create header tags # Create header tags
(re.compile('<h2.*?id=BookTitle.*?>(.*?)</span>', re.IGNORECASE|re.DOTALL), (re.compile('<h2.*?id=BookTitle.*?(align=)*(?(1)(\w+))*.*?>(.*?)</h2>', re.IGNORECASE|re.DOTALL),
lambda match : '<h1 align="center">%s</h1>'%(match.group(1),)), lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
(re.compile('<h2.*?id=BookAuthor.*?>(.*?)</span>', re.IGNORECASE|re.DOTALL), (re.compile('<h2.*?id=BookAuthor.*?(align=)*(?(1)(\w+))*.*?>(.*?)</h2>', re.IGNORECASE|re.DOTALL),
lambda match : '<h2 align="right">%s</h2>'%(match.group(1),)), lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
(re.compile('<span.*?id=title.*?>(.*?)</span>', re.IGNORECASE|re.DOTALL), (re.compile('<span.*?id=title.*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h2>%s</h2>'%(match.group(1),)), lambda match : '<h2>%s</h2>'%(match.group(1),)),
(re.compile('<span.*?id=subtitle.*?>(.*?)</span>', re.IGNORECASE|re.DOTALL), (re.compile('<span.*?id=subtitle.*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h3>%s</h3>'%(match.group(1),)), lambda match : '<h3>%s</h3>'%(match.group(1),)),
# Blank lines
(re.compile('<div.*?>(&nbsp;){4}</div>', re.IGNORECASE),
lambda match : '<p></p>'),
# HR
(re.compile('<hr>', re.IGNORECASE),
lambda match : '<span style="page-break-after:always"> </span>'),
] ]
def __hasattr__(self, attr): def __hasattr__(self, attr):
@ -196,6 +203,8 @@ class HTMLConverter(object):
'content':re.compile('Baen', re.IGNORECASE)})) 'content':re.compile('Baen', re.IGNORECASE)}))
def start_on_file(self, path, is_root=True, link_level=0): def start_on_file(self, path, is_root=True, link_level=0):
self.css = HTMLConverter.CSS.copy()
self.pseudo_css = {}
path = os.path.abspath(path) path = os.path.abspath(path)
os.chdir(os.path.dirname(path)) os.chdir(os.path.dirname(path))
self.file_name = os.path.basename(path) self.file_name = os.path.basename(path)
@ -210,6 +219,8 @@ class HTMLConverter(object):
if self.pdftohtml: if self.pdftohtml:
nmassage.extend(HTMLConverter.PDFTOHTML) nmassage.extend(HTMLConverter.PDFTOHTML)
#raw = unicode(raw, 'utf8', 'replace') #raw = unicode(raw, 'utf8', 'replace')
if self.book_designer:
nmassage.extend(HTMLConverter.BOOK_DESIGNER)
try: try:
soup = BeautifulSoup(raw, soup = BeautifulSoup(raw,
convertEntities=BeautifulSoup.HTML_ENTITIES, convertEntities=BeautifulSoup.HTML_ENTITIES,
@ -225,6 +236,13 @@ class HTMLConverter(object):
self.baen = True self.baen = True
self.logger.info('Baen file detected. Re-parsing...') self.logger.info('Baen file detected. Re-parsing...')
return self.start_on_file(path, is_root=is_root, link_level=link_level) return self.start_on_file(path, is_root=is_root, link_level=link_level)
if self.book_designer:
t = soup.find(id='BookTitle')
if t:
self.book.set_title(self.get_text(t))
a = soup.find(id='BookAuthor')
if a:
self.book.set_author(self.get_text(a))
self.logger.info('\tConverting to BBeB...') self.logger.info('\tConverting to BBeB...')
sys.stdout.flush() sys.stdout.flush()
self.current_page = None self.current_page = None
@ -234,8 +252,6 @@ class HTMLConverter(object):
match = self.PAGE_BREAK_PAT.search(unicode(soup)) match = self.PAGE_BREAK_PAT.search(unicode(soup))
if match and not re.match('avoid', match.group(1), re.IGNORECASE): if match and not re.match('avoid', match.group(1), re.IGNORECASE):
self.page_break_found = True self.page_break_found = True
self.css = HTMLConverter.CSS.copy()
self.pseudo_css = {}
self.target_prefix = path self.target_prefix = path
self.links[path] = [] self.links[path] = []
self.previous_text = '\n' self.previous_text = '\n'
@ -278,7 +294,7 @@ class HTMLConverter(object):
Parses a style attribute. The code within a CSS selector block or in Parses a style attribute. The code within a CSS selector block or in
the style attribute of an HTML element. the style attribute of an HTML element.
@return: A dictionary with one entry for each property where the key @return: A dictionary with one entry for each property where the key
is the property name and the value is the property value. is the property name and the value is the property value.
""" """
prop = dict() prop = dict()
for s in props.split(';'): for s in props.split(';'):
@ -301,7 +317,7 @@ class HTMLConverter(object):
# however we need to as we don't do alignment at a block level. # however we need to as we don't do alignment at a block level.
# float is removed by the process_alignment function. # float is removed by the process_alignment function.
if chk.startswith('font') or chk == 'text-align' or \ if chk.startswith('font') or chk == 'text-align' or \
chk == 'float' or chk == 'white-space': chk == 'float' or chk == 'white-space' or chk == 'color':
temp[key] = pcss[key] temp[key] = pcss[key]
prop.update(temp) prop.update(temp)
@ -656,7 +672,11 @@ class HTMLConverter(object):
unneeded.append(prop) unneeded.append(prop)
for prop in unneeded: for prop in unneeded:
fp.pop(prop) fp.pop(prop)
elem = Span(text=src, **fp) if (fp or force_span_use) else src attrs = {}
if 'color' in css:
attrs['textcolor'] = lrs_color(css['color'])
attrs.update(fp)
elem = Span(text=src, **attrs) if (attrs or force_span_use) else src
if css.has_key('text-decoration'): if css.has_key('text-decoration'):
dec = css['text-decoration'].lower() dec = css['text-decoration'].lower()
linepos = 'after' if dec == 'underline' else 'before' if dec == 'overline' else None linepos = 'after' if dec == 'underline' else 'before' if dec == 'overline' else None
@ -1372,6 +1392,8 @@ class HTMLConverter(object):
elif tagname == 'font': elif tagname == 'font':
if tag.has_key('face'): if tag.has_key('face'):
tag_css['font-family'] = tag['face'] tag_css['font-family'] = tag['face']
if tag.has_key('color'):
tag_css['color'] = tag['color']
self.process_children(tag, tag_css, tag_pseudo_css) self.process_children(tag, tag_css, tag_pseudo_css)
elif tagname in ['br']: elif tagname in ['br']:
self.line_break() self.line_break()

View File

@ -442,6 +442,14 @@ class Book(Delegator):
self.gc_count = 0 self.gc_count = 0
def set_title(self, title):
    """
    Replace the title stored on the innermost delegate
    (self.delegates[0].delegates[0].delegates[0]).

    The stored value is a pair; only its first element is replaced with
    `title`, the second element is carried over unchanged (presumably the
    reading/sort form of the title -- confirm against the delegate class).
    """
    info = self.delegates[0].delegates[0].delegates[0]
    previous = info.title
    info.title = (title, previous[1])
def set_author(self, author):
    """
    Replace the author stored on the innermost delegate
    (self.delegates[0].delegates[0].delegates[0]).

    The stored value is a pair; only its first element is replaced with
    `author`, the second element is carried over unchanged (presumably the
    reading/sort form of the author -- confirm against the delegate class).
    """
    info = self.delegates[0].delegates[0].delegates[0]
    previous = info.author
    info.author = (author, previous[1])
def create_text_style(self, **settings): def create_text_style(self, **settings):
ans = TextStyle(**self.defaultTextStyle.attrs.copy()) ans = TextStyle(**self.defaultTextStyle.attrs.copy())
ans.update(settings) ans.update(settings)