From 7fefb01f35c0ae8859cbc9e36ada2fa450185d60 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 4 Oct 2008 13:02:38 -0700 Subject: [PATCH] Implemented font size control in EPUB conversion. You can now specify the base font size in absolute units. Remove spacing between paragraphs by default for EPUB output. Can be turned off. Added options for more sophisticated generation of an automatic Table of Contents in EPUB files. Restricted Scientific American recipe to only download articles in current issue. --- src/calibre/__init__.py | 2 +- src/calibre/ebooks/epub/__init__.py | 22 +- src/calibre/ebooks/epub/fonts.py | 300 ++++++++++++++++++ src/calibre/ebooks/epub/from_html.py | 68 ++-- src/calibre/ebooks/epub/split.py | 31 +- src/calibre/ebooks/html.py | 270 +++++++++++----- src/calibre/gui2/dialogs/epub.py | 11 + src/calibre/gui2/dialogs/epub.ui | 49 ++- src/calibre/linux.py | 5 + src/calibre/trac/plugins/download.py | 2 +- src/calibre/web/feeds/__init__.py | 78 ++++- src/calibre/web/feeds/news.py | 7 +- src/calibre/web/feeds/recipes/espn.py | 2 +- src/calibre/web/feeds/recipes/newsweek.py | 2 +- .../web/feeds/recipes/outlook_india.py | 2 +- .../web/feeds/recipes/scientific_american.py | 115 +++++-- src/calibre/web/fetch/simple.py | 2 +- 17 files changed, 788 insertions(+), 180 deletions(-) create mode 100644 src/calibre/ebooks/epub/fonts.py diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 1f33e34483..61d7801cb4 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -284,7 +284,7 @@ def english_sort(x, y): class LoggingInterface: def __init__(self, logger): - self.__logger = logger + self.__logger = self.logger = logger def setup_cli_handler(self, verbosity): for handler in self.__logger.handlers: diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index bf151d840c..1d8b6e6b4c 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -7,7 +7,6 @@ __docformat__ = 'restructuredtext en' Conversion to EPUB. ''' import sys, textwrap -from lxml import html from calibre.utils.config import Config, StringConfig from calibre.utils.zipfile import ZipFile, ZIP_STORED from calibre.ebooks.html import config as common_config, tostring @@ -16,13 +15,11 @@ class DefaultProfile(object): flow_size = sys.maxint screen_size = None - dpi = 100 class PRS505(DefaultProfile): flow_size = 300000 screen_size = (600, 775) - dpi = 166 PROFILES = { @@ -30,6 +27,13 @@ PROFILES = { 'None' : DefaultProfile, } +def rules(stylesheets): + for s in stylesheets: + if hasattr(s, 'cssText'): + for r in s: + if r.type == r.STYLE_RULE: + yield r + def initialize_container(path_to_container, opf_name='metadata.opf'): ''' Create an empty EPUB document, with a default skeleton. @@ -95,6 +99,12 @@ to auto-generate a Table of Contents. help=_("Don't add auto-detected chapters to the Table of Contents.")) toc('toc_threshold', ['--toc-threshold'], default=6, help=_('If fewer than this number of chapters is detected, then links are added to the Table of Contents.')) + toc('level1_toc', ['--level1-toc'], default=None, + help=_('XPath expression that specifies all tags that should be added to the Table of Contents at level one. If this is specified, it takes precedence over other forms of auto-detection.')) + toc('level2_toc', ['--level2-toc'], default=None, + help=_('XPath expression that specifies all tags that should be added to the Table of Contents at level two. Each entry is added under the previous level one entry.')) + toc('from_ncx', ['--from-ncx'], default=None, + help=_('Path to a .ncx file that contains the table of contents to use for this ebook. The NCX file should contain links relative to the directory it is placed in. See http://www.niso.org/workrooms/daisy/Z39-86-2005.html#NCX for an overview of the NCX format.')) toc('use_auto_toc', ['--use-auto-toc'], default=False, help=_('Normally, if the source file already has a Table of Contents, it is used in preference to the autodetected one. With this option, the autodetected one is always used.')) @@ -107,8 +117,10 @@ to auto-generate a Table of Contents. help=_('Set the left margin in pts. Default is %default')) layout('margin_right', ['--margin-right'], default=5.0, help=_('Set the right margin in pts. Default is %default')) - layout('base_font_size', ['--base-font-size'], default=100.0, - help=_('The base font size as a percentage. Default is %default. Changing this should allow you to control overall base font sizes, except for input HTML files that use absolute font sizes for their text tags.')) + layout('base_font_size2', ['--base-font-size'], default=12.0, + help=_('The base font size in pts. Default is %defaultpt. Set to 0 to disable rescaling of fonts.')) + layout('remove_paragraph_spacing', ['--remove-paragraph-spacing'], default=True, + help=_('Remove spacing between paragraphs. Will not work if the source file forces inter-paragraph spacing.')) c.add_opt('show_opf', ['--show-opf'], default=False, group='debug', help=_('Print generated OPF file to stdout')) diff --git a/src/calibre/ebooks/epub/fonts.py b/src/calibre/ebooks/epub/fonts.py new file mode 100644 index 0000000000..5d0887f2d0 --- /dev/null +++ b/src/calibre/ebooks/epub/fonts.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +Font size rationalization. See :function:`relativize`. +''' + +import logging, re, operator, functools, collections, unittest, copy, sys +from xml.dom import SyntaxErr + +from lxml.cssselect import CSSSelector +from lxml import etree +from lxml.html import HtmlElement + +from calibre.ebooks.html import fromstring +from calibre.ebooks.epub import rules +from cssutils import CSSParser + +num = r'[-]?\d+|[-]?\d*\.\d+' +length = r'(?P0)|(?P{num})(?P%|em|ex|px|in|cm|mm|pt|pc)'.replace('{num}', num) +absolute_size = r'(?P(x?x-)?(small|large)|medium)' +relative_size = r'(?Psmaller|larger)' + +font_size_pat = re.compile('|'.join((relative_size, absolute_size, length)), re.I) +line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num)) + +PTU = { + 'in' : 72., + 'cm' : 72/2.54, + 'mm' : 72/25.4, + 'pt' : 1.0, + 'pc' : 1/12., + } + +DEFAULT_FONT_SIZE = 12 + +class Rationalizer(object): + + @classmethod + def specificity(cls, s): + '''Map CSS specificity tuple to a single integer''' + return sum([10**(4-i) + x for i,x in enumerate(s)]) + + @classmethod + def compute_font_size(cls, elem): + ''' + Calculate the effective font size of an element traversing its ancestors as far as + neccessary. + ''' + cfs = elem.computed_font_size + if cfs is not None: + return + sfs = elem.specified_font_size + if callable(sfs): + parent = elem.getparent() + cls.compute_font_size(parent) + elem.computed_font_size = sfs(parent.computed_font_size) + else: + elem.computed_font_size = sfs + + @classmethod + def calculate_font_size(cls, style): + 'Return font size in pts from style object. For relative units returns a callable' + match = font_size_pat.search(style.font) + fs = '' + if match: + fs = match.group() + if style.fontSize: + fs = style.fontSize + + match = font_size_pat.search(fs) + if match is None: + return None + match = match.groupdict() + unit = match.get('unit', '') + if unit: unit = unit.lower() + if unit in PTU.keys(): + return PTU[unit] * float(match['num']) + if unit in ('em', 'ex'): + return functools.partial(operator.mul, float(match['num'])) + if unit == '%': + return functools.partial(operator.mul, float(match['num'])/100.) + abs = match.get('abs', '') + if abs: abs = abs.lower() + if abs: + x = (1.2)**(abs.count('x') * (-1 if 'small' in abs else 1)) + return 12 * x + if match.get('zero', False): + return 0. + return functools.partial(operator.mul, 1.2) if 'larger' in fs.lower() else functools.partial(operator.mul, 0.8) + + @classmethod + def resolve_rules(cls, stylesheets): + for sheet in stylesheets: + if hasattr(sheet, 'fs_rules'): + continue + sheet.fs_rules = [] + sheet.lh_rules = [] + for r in sheet: + if r.type == r.STYLE_RULE: + font_size = cls.calculate_font_size(r.style) + if font_size is not None: + for s in r.selectorList: + sheet.fs_rules.append([CSSSelector(s.selectorText), font_size]) + orig = line_height_pat.search(r.style.lineHeight) + if orig is not None: + for s in r.selectorList: + sheet.lh_rules.append([CSSSelector(s.selectorText), float(orig.group(1)) * PTU[orig.group(2).lower()]]) + + + @classmethod + def apply_font_size_rules(cls, stylesheets, root): + 'Add a ``specified_font_size`` attribute to every element that has a specified font size' + cls.resolve_rules(stylesheets) + for sheet in stylesheets: + for selector, font_size in sheet.fs_rules: + elems = selector(root) + for elem in elems: + elem.specified_font_size = font_size + + @classmethod + def remove_font_size_information(cls, stylesheets): + for r in rules(stylesheets): + r.style.removeProperty('font-size') + try: + new = font_size_pat.sub('', r.style.font).strip() + if new: + r.style.font = new + else: + r.style.removeProperty('font') + except SyntaxErr: + r.style.removeProperty('font') + if line_height_pat.search(r.style.lineHeight) is not None: + r.style.removeProperty('line-height') + + @classmethod + def compute_font_sizes(cls, root, stylesheets, base=12): + stylesheets = [s for s in stylesheets if hasattr(s, 'cssText')] + cls.apply_font_size_rules(stylesheets, root) + + # Compute the effective font size of all tags + root.computed_font_size = DEFAULT_FONT_SIZE + for elem in root.iter(etree.Element): + cls.compute_font_size(elem) + + extra_css = {} + if base > 0: + # Calculate the "base" (i.e. most common) font size + font_sizes = collections.defaultdict(lambda : 0) + body = root.xpath('//body')[0] + IGNORE = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6') + for elem in body.iter(etree.Element): + if elem.tag not in IGNORE: + t = getattr(elem, 'text', '') + if t: t = t.strip() + if t: + font_sizes[elem.computed_font_size] += len(t) + + t = getattr(elem, 'tail', '') + if t: t = t.strip() + if t: + parent = elem.getparent() + if parent.tag not in IGNORE: + font_sizes[parent.computed_font_size] += len(t) + + try: + most_common = max(font_sizes.items(), key=operator.itemgetter(1))[0] + scale = base/most_common if most_common > 0 else 1. + except ValueError: + scale = 1. + + # rescale absolute line-heights + counter = 0 + for sheet in stylesheets: + for selector, lh in sheet.lh_rules: + for elem in selector(root): + elem.set('id', elem.get('id', 'cfs_%d'%counter)) + counter += 1 + if not extra_css.has_key(elem.get('id')): + extra_css[elem.get('id')] = [] + extra_css[elem.get('id')].append('line-height:%fpt'%(lh*scale)) + + + + # Rescale all computed font sizes + for elem in body.iter(etree.Element): + if isinstance(elem, HtmlElement): + elem.computed_font_size *= scale + + # Remove all font size specifications from the last stylesheet + cls.remove_font_size_information(stylesheets[-1:]) + + # Create the CSS to implement the rescaled font sizes + for elem in body.iter(etree.Element): + cfs, pcfs = map(operator.attrgetter('computed_font_size'), (elem, elem.getparent())) + if abs(cfs-pcfs) > 1/12. and abs(pcfs) > 1/12.: + elem.set('id', elem.get('id', 'cfs_%d'%counter)) + counter += 1 + if not extra_css.has_key(elem.get('id')): + extra_css[elem.get('id')] = [] + extra_css[elem.get('id')].append('font-size: %f%%'%(100*(cfs/pcfs))) + + css = CSSParser(loglevel=logging.ERROR).parseString('') + for id, r in extra_css.items(): + css.add('#%s {%s}'%(id, ';'.join(r))) + return css + + @classmethod + def rationalize(cls, stylesheets, root, opts): + logger = logging.getLogger('html2epub') + logger.info('\t\tRationalizing fonts...') + extra_css = None + if opts.base_font_size2 > 0: + try: + extra_css = cls.compute_font_sizes(root, stylesheets, base=opts.base_font_size2) + except: + logger.warning('Failed to rationalize font sizes.') + if opts.verbose > 1: + logger.exception('') + finally: + root.remove_font_size_information() + logger.debug('\t\tDone rationalizing') + return extra_css + +################################################################################ +############## Testing +################################################################################ + +class FontTest(unittest.TestCase): + + def setUp(self): + from calibre.ebooks.epub import config + self.opts = config(defaults='').parse() + self.html = ''' + + + Test document + + +
+ +

Some text

+
+

Some other text.

+

The longest piece of single font size text in this entire file. Used to test resizing.

+ + + ''' + self.root = fromstring(self.html) + + def do_test(self, css, base=DEFAULT_FONT_SIZE, scale=1): + root1 = copy.deepcopy(self.root) + root1.computed_font_size = DEFAULT_FONT_SIZE + stylesheet = CSSParser(loglevel=logging.ERROR).parseString(css) + stylesheet2 = Rationalizer.compute_font_sizes(root1, [stylesheet], base) + root2 = copy.deepcopy(root1) + root2.remove_font_size_information() + root2.computed_font_size = DEFAULT_FONT_SIZE + Rationalizer.apply_font_size_rules([stylesheet2], root2) + for elem in root2.iter(etree.Element): + Rationalizer.compute_font_size(elem) + for e1, e2 in zip(root1.xpath('//body')[0].iter(etree.Element), root2.xpath('//body')[0].iter(etree.Element)): + self.assertAlmostEqual(e1.computed_font_size, e2.computed_font_size, + msg='Computed font sizes for %s not equal. Original: %f Processed: %f'%\ + (root1.getroottree().getpath(e1), e1.computed_font_size, e2.computed_font_size)) + return stylesheet2.cssText + + def testStripping(self): + 'Test that any original entries are removed from the CSS' + css = 'p { font: bold 10px italic smaller; font-size: x-large} \na { font-size: 0 }' + css = CSSParser(loglevel=logging.ERROR).parseString(css) + Rationalizer.compute_font_sizes(copy.deepcopy(self.root), [css]) + self.assertEqual(css.cssText.replace(' ', '').replace('\n', ''), + 'p{font:bolditalic}') + + def testIdentity(self): + 'Test that no unnecessary font size changes are made' + extra_css = self.do_test('div {font-size:12pt} \nspan {font-size:100%}') + self.assertEqual(extra_css.strip(), '') + + def testRelativization(self): + 'Test conversion of absolute to relative sizes' + self.do_test('#p1 {font: 24pt} b {font: 12pt} .it {font: 48pt} #p2 {font: 100%}') + + def testResizing(self): + 'Test resizing of fonts' + self.do_test('#longest {font: 24pt} .it {font:20pt; line-height:22pt}') + + +def suite(): + return unittest.TestLoader().loadTestsFromTestCase(FontTest) + +def test(): + unittest.TextTestRunner(verbosity=2).run(suite()) + +if __name__ == '__main__': + sys.exit(test()) + \ No newline at end of file diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index e5fe93ce27..54f91e9f0f 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -32,8 +32,7 @@ Conversion of HTML/OPF files follows several stages: * The EPUB container is created. ''' -import os, sys, re, cStringIO, logging -from contextlib import nested +import os, sys, cStringIO, logging from lxml.etree import XPath try: @@ -41,7 +40,7 @@ try: except ImportError: import Image as PILImage -from calibre.ebooks.html import Processor, get_text, merge_metadata, get_filelist,\ +from calibre.ebooks.html import Processor, merge_metadata, get_filelist,\ opf_traverse, create_metadata, rebase_toc from calibre.ebooks.epub import config as common_config from calibre.ptempfile import TemporaryDirectory @@ -50,21 +49,23 @@ from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata.opf2 import OPF from calibre.ebooks.epub import initialize_container, PROFILES from calibre.ebooks.epub.split import split +from calibre.ebooks.epub.fonts import Rationalizer from calibre.constants import preferred_encoding -class HTMLProcessor(Processor): +class HTMLProcessor(Processor, Rationalizer): - def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles): + def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, stylesheets): Processor.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, - name='html2epub') + name='html2epub') if opts.verbose > 2: self.debug_tree('parsed') self.detect_chapters() - - self.extract_css() - self.relativize_font_sizes() + self.extract_css(stylesheets) + if self.opts.base_font_size2 > 0: + self.font_css = self.rationalize(self.external_stylesheets+[self.stylesheet], + self.root, self.opts) if opts.verbose > 2: self.debug_tree('nocss') @@ -73,19 +74,6 @@ class HTMLProcessor(Processor): meta.getparent().remove(meta) Processor.save(self) - #self.collect_font_statistics() - - - def collect_font_statistics(self): - ''' - Collect font statistics to figure out the base font size used in this - HTML document. - ''' - self.font_statistics = {} #: A mapping of font size (in pts) to number of characters rendered at that font size - for text in get_text(self.body if self.body is not None else self.root): - length, parent = len(re.sub(r'\s+', '', text)), text.getparent() - #TODO: Use cssutils on self.raw_css to figure out the font size - # of this piece of text and update statistics accordingly @@ -104,21 +92,30 @@ the element of the OPF file. def parse_content(filelist, opts, tdir): os.makedirs(os.path.join(tdir, 'content', 'resources')) - resource_map = {} + resource_map, stylesheets = {}, {} toc = TOC(base_path=tdir, type='root') + stylesheet_map = {} for htmlfile in filelist: + logging.getLogger('html2epub').debug('Processing %s...'%htmlfile) hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'), - resource_map, filelist) + resource_map, filelist, stylesheets) hp.populate_toc(toc) hp.save() + stylesheet_map[os.path.basename(hp.save_path())] = \ + [s for s in hp.external_stylesheets + [hp.stylesheet, hp.font_css, hp.override_css] if s is not None] + logging.getLogger('html2epub').debug('Saving stylesheets...') + if opts.base_font_size2 > 0: + Rationalizer.remove_font_size_information(stylesheets.values()) + for path, css in stylesheets.items(): + open(path, 'wb').write(getattr(css, 'cssText', css).encode('utf-8')) if toc.count('chapter') > opts.toc_threshold: toc.purge(['file', 'link', 'unknown']) if toc.count('chapter') + toc.count('file') > opts.toc_threshold: toc.purge(['link', 'unknown']) toc.purge(['link'], max=opts.max_toc_links) - return resource_map, hp.htmlfile_map, toc + return resource_map, hp.htmlfile_map, toc, stylesheet_map def resize_cover(im, opts): width, height = im.size @@ -176,7 +173,7 @@ def process_title_page(mi, filelist, htmlfilemap, opts, tdir): Cover - +
cover
@@ -212,11 +209,22 @@ def convert(htmlfile, opts, notification=None): mi = merge_metadata(htmlfile, opf, opts) opts.chapter = XPath(opts.chapter, namespaces={'re':'http://exslt.org/regular-expressions'}) + if opts.level1_toc: + opts.level1_toc = XPath(opts.level1_toc, + namespaces={'re':'http://exslt.org/regular-expressions'}) + else: + opts.level1_toc = None + if opts.level2_toc: + opts.level2_toc = XPath(opts.level2_toc, + namespaces={'re':'http://exslt.org/regular-expressions'}) + else: + opts.level2_toc = None with TemporaryDirectory(suffix='_html2epub', keep=opts.keep_intermediate) as tdir: if opts.keep_intermediate: print 'Intermediate files in', tdir - resource_map, htmlfile_map, generated_toc = parse_content(filelist, opts, tdir) + resource_map, htmlfile_map, generated_toc, stylesheet_map = \ + parse_content(filelist, opts, tdir) logger = logging.getLogger('html2epub') resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()] @@ -235,6 +243,10 @@ def convert(htmlfile, opts, notification=None): rebase_toc(mi.toc, htmlfile_map, tdir) if opts.use_auto_toc or mi.toc is None or len(list(mi.toc.flat())) < 2: mi.toc = generated_toc + if opts.from_ncx: + toc = TOC() + toc.read_ncx_toc(opts.from_ncx) + mi.toc = toc for item in mi.manifest: if getattr(item, 'mime_type', None) == 'text/html': item.mime_type = 'application/xhtml+xml' @@ -247,7 +259,7 @@ def convert(htmlfile, opts, notification=None): f.write(toc) if opts.show_ncx: print toc - split(opf_path, opts) + split(opf_path, opts, stylesheet_map) opf = OPF(opf_path, tdir) opf.remove_guide() if has_title_page: diff --git a/src/calibre/ebooks/epub/split.py b/src/calibre/ebooks/epub/split.py index 11df503dc4..30d3857941 100644 --- a/src/calibre/ebooks/epub/split.py +++ b/src/calibre/ebooks/epub/split.py @@ -12,10 +12,9 @@ import os, math, logging, functools, collections, re, copy from lxml.etree import XPath as _XPath from lxml import etree, html from lxml.cssselect import CSSSelector -from cssutils import CSSParser from calibre.ebooks.metadata.opf2 import OPF -from calibre.ebooks.epub import tostring +from calibre.ebooks.epub import tostring, rules from calibre import CurrentDir, LoggingInterface XPath = functools.partial(_XPath, namespaces={'re':'http://exslt.org/regular-expressions'}) @@ -35,7 +34,7 @@ class SplitError(ValueError): class Splitter(LoggingInterface): - def __init__(self, path, opts, always_remove=False): + def __init__(self, path, opts, stylesheet_map, always_remove=False): LoggingInterface.__init__(self, logging.getLogger('htmlsplit')) self.setup_cli_handler(opts.verbose) self.path = path @@ -46,22 +45,8 @@ class Splitter(LoggingInterface): self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.) root = html.fromstring(open(content(path)).read()) - css = XPath('//link[@type = "text/css" and @rel = "stylesheet"]')(root) - if css: - cssp = os.path.join('content', *(css[0].get('href').split('/'))) - self.log_debug('\t\tParsing stylesheet...') - try: - stylesheet = CSSParser().parseString(open(cssp, 'rb').read()) - except: - self.log_warn('Failed to parse CSS. Splitting on page-breaks is disabled') - if self.opts.verbose > 1: - self.log_exception('') - stylesheet = None - else: - stylesheet = None self.page_breaks = [] - if stylesheet is not None: - self.find_page_breaks(stylesheet, root) + self.find_page_breaks(stylesheet_map[self.path], root) self.trees = [] self.split_size = 0 @@ -189,14 +174,12 @@ class Splitter(LoggingInterface): self.split(t) - def find_page_breaks(self, stylesheet, root): + def find_page_breaks(self, stylesheets, root): ''' Find all elements that have either page-break-before or page-break-after set. ''' page_break_selectors = set([]) - for rule in stylesheet: - if rule.type != rule.STYLE_RULE: - continue + for rule in rules(stylesheets): before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower() after = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower() try: @@ -385,7 +368,7 @@ def fix_ncx(path, changes): if changed: open(path, 'wb').write(etree.tostring(tree.getroot(), encoding='UTF-8', xml_declaration=True)) -def split(pathtoopf, opts): +def split(pathtoopf, opts, stylesheet_map): pathtoopf = os.path.abspath(pathtoopf) with CurrentDir(os.path.dirname(pathtoopf)): opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf)) @@ -403,7 +386,7 @@ def split(pathtoopf, opts): for f in html_files: if os.stat(content(f)).st_size > opts.profile.flow_size: try: - changes.append(Splitter(f, opts, + changes.append(Splitter(f, opts, stylesheet_map, always_remove=(always_remove or \ os.stat(content(f)).st_size > 5*opts.profile.flow_size))) except (SplitError, RuntimeError): diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index e22f2bbc0e..e5e6f3f37a 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -8,12 +8,14 @@ Code to recursively parse HTML files and create an open ebook in a specified directory or zip file. All the action starts in :function:`create_dir`. ''' -import sys, re, os, shutil, logging, tempfile, cStringIO +import sys, re, os, shutil, logging, tempfile, cStringIO, operator, functools from urlparse import urlparse from urllib import unquote -from lxml import html, etree -from lxml.html import soupparser +from lxml import etree +from lxml.html import HtmlElementClassLookup, HTMLParser as _HTMLParser, \ + fromstring as _fromstring, tostring as _tostring, \ + soupparser, HtmlElement from lxml.etree import XPath get_text = XPath("//text()") @@ -25,9 +27,67 @@ from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.opf2 import OPF, OPFCreator from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile from calibre.utils.zipfile import ZipFile +from cssutils import CSSParser + +class HTMLElement(HtmlElement): + + @apply + def specified_font_size(): + + def fget(self): + ans = self.get('specified_font_size', '') + if not ans: + return lambda x: x + if ans.startswith('f'): + return functools.partial(operator.mul, float(ans[1:])) + return float(ans) + + def fset(self, val): + self.set('specified_font_size', ('f'+repr(val(1))) if callable(val) else repr(val)) + + return property(fget=fget, fset=fset) + + @apply + def computed_font_size(): + def fget(self): + ans = self.get('computed_font_size', '') + if ans == '': + return None + return float(ans) + + def fset(self, val): + self.set('computed_font_size', repr(val)) + + return property(fget=fget, fset=fset) + + def remove_font_size_information(self): + for elem in self.iter(): + for p in ('computed', 'specified'): + elem.attrib.pop(p+'_font_size', None) + + def getpath(self): + return self.getroottree().getpath(self) + +class Lookup(HtmlElementClassLookup): + + def lookup(self, node_type, document, namespace, name): + if node_type == 'element': + return HTMLElement + return HtmlElementClassLookup.lookup(self, node_type, document, namespace, name) + +class HTMLParser(_HTMLParser): + + def __init__(self, **kwargs): + super(HTMLParser, self).__init__(**kwargs) + self.set_element_class_lookup(Lookup()) + +parser = HTMLParser() + +def fromstring(raw, **kw): + return _fromstring(raw, parser=parser, **kw) def tostring(root, pretty_print=False): - return html.tostring(root, encoding='utf-8', method='xml', + return _tostring(root, encoding='utf-8', method='xml', include_meta_content_type=True, pretty_print=pretty_print) @@ -372,11 +432,11 @@ class Parser(PreProcessor, LoggingInterface): for pat in ENCODING_PATS: src = pat.sub('', src) try: - self.root = html.fromstring(src) + self.root = fromstring(src) except: if self.opts.verbose: self.log_exception('lxml based parsing failed') - self.root = soupparser.fromstring(src) + self.root = soupparser.fromstring(src, makeelement=parser.makeelement) head = self.root.xpath('./head') if head: head = head[0] @@ -402,7 +462,7 @@ class Parser(PreProcessor, LoggingInterface): os.makedirs(tdir) with open(os.path.join(tdir, '%s-%s.html'%\ (os.path.basename(self.htmlfile.path), name)), 'wb') as f: - f.write(html.tostring(self.root, encoding='utf-8')) + f.write(tostring(self.root, encoding='utf-8')) self.log_debug(_('Written processed HTML to ')+f.name) @@ -443,19 +503,21 @@ class Processor(Parser): ''' LINKS_PATH = XPath('//a[@href]') + PIXEL_PAT = re.compile(r'([-]?\d+|[-]?\d*\.\d+)px') + + def __init__(self, *args, **kwargs): + Parser.__init__(self, *args, **kwargs) + temp = LoggingInterface(logging.getLogger('cssutils')) + temp.setup_cli_handler(self.opts.verbose) + self.css_parser = CSSParser(log=temp.logger, loglevel=logging.ERROR) + self.stylesheet = self.font_css = self.override_css = None def detect_chapters(self): self.detected_chapters = self.opts.chapter(self.root) for elem in self.detected_chapters: text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')]) self.log_info('\tDetected chapter: %s', text[:50]) - if self.opts.chapter_mark in ('both', 'pagebreak'): - style = elem.get('style', '').strip() - if style and not style.endswith(';'): - style += '; ' - style += 'page-break-before: always' - elem.set('style', style) - if self.opts.chapter_mark in ('both', 'rule'): + if self.opts.chapter_mark != 'none': hr = etree.Element('hr') if elem.getprevious() is None: elem.getparent()[:0] = [hr] @@ -466,16 +528,28 @@ class Processor(Parser): insert = i break elem.getparent()[insert:insert] = [hr] + if self.opts.chapter_mark != 'rule': + hr.set('style', 'width:0pt;page-break-before:always') + if self.opts.chapter_mark == 'both': + hr2 = etree.Element('hr') + hr2.tail = u'\u00a0' + p = hr.getparent() + i = p.index(hr) + p[i:i] = [hr2] + def save(self): - style_path = os.path.basename(self.save_path())+'.css' - style = etree.SubElement(self.head, 'link', attrib={'type':'text/css', 'rel':'stylesheet', - 'href':'resources/'+style_path, - 'charset':'UTF-8'}) - style.tail = '\n' - style_path = os.path.join(os.path.dirname(self.save_path()), 'resources', style_path) - open(style_path, 'wb').write(self.css.encode('utf-8')) + style_path = os.path.splitext(os.path.basename(self.save_path()))[0] + for i, sheet in enumerate([self.stylesheet, self.font_css, self.override_css]): + if sheet is not None: + style = etree.SubElement(self.head, 'link', attrib={'type':'text/css', 'rel':'stylesheet', + 'href':'resources/%s_%d.css'%(style_path, i), + 'charset':'UTF-8'}) + style.tail = '\n' + path = os.path.join(os.path.dirname(self.save_path()), *(style.get('href').split('/'))) + self.resource_map[path] = style.get('href') + open(path, 'wb').write(getattr(sheet, 'cssText', sheet).encode('utf-8')) return Parser.save(self) def populate_toc(self, toc): @@ -491,14 +565,45 @@ class Processor(Parser): text = text[:50] + u'\u2026' return target.add_item(href, fragment, text, type=type) - # Add chapters to TOC + name = self.htmlfile_map[self.htmlfile.path] + href = 'content/'+name + + # Add level 1 and level 2 TOC items counter = 0 + if self.opts.level1_toc is not None: + level1 = self.opts.level1_toc(self.root) + if level1: + added = {} + for elem in level1: + text = (u''.join(elem.xpath('string()'))).strip() + if text: + id = elem.get('id', 'calibre_chapter_%d'%counter) + counter += 1 + elem.set('id', id) + added[elem] = add_item(href, id, text, toc, type='chapter') + add_item(href, id, 'Top', added[elem], type='chapter') + if self.opts.level2_toc is not None: + level2 = list(self.opts.level2_toc(self.root)) + for elem in level2: + level1 = None + for item in self.root.iterdescendants(): + if item in added.keys(): + level1 = added[item] + elif item == elem and level1 is not None: + text = (u''.join(elem.xpath('string()'))).strip() + if text: + id = elem.get('id', 'calibre_chapter_%d'%counter) + counter += 1 + elem.set('id', id) + add_item(href, id, text, level1, type='chapter') + + + # Add chapters to TOC + if not self.opts.no_chapters_in_toc: for elem in getattr(self, 'detected_chapters', []): text = (u''.join(elem.xpath('string()'))).strip() if text: - name = self.htmlfile_map[self.htmlfile.path] - href = 'content/'+name counter += 1 id = elem.get('id', 'calibre_chapter_%d'%counter) elem.set('id', id) @@ -518,8 +623,7 @@ class Processor(Parser): pass - name = self.htmlfile_map[self.htmlfile.path] - href = 'content/'+name + if referrer.href != href: # Happens for root file @@ -541,13 +645,24 @@ class Processor(Parser): name = self.htmlfile_map[self.htmlfile.referrer.path] add_item(href, fragment, text, target) - + @classmethod + def preprocess_css(cls, css, dpi=96): + def rescale(match): + val = match.group(1) + try: + val = float(val) + except ValueError: + return '' + return '%fpt'%(72 * val/dpi) - def extract_css(self): + return cls.PIXEL_PAT.sub(rescale, css) + + def extract_css(self, parsed_sheets): ''' - Remove all CSS information from the document and store in self.raw_css. - This includes tags. + Remove all CSS information from the document and store it as + :class:`StyleSheet` objects. ''' + def get_id(chapter, counter, prefix='calibre_css_'): new_id = '%s_%d'%(prefix, counter) if chapter.tag.lower() == 'a' and 'name' in chapter.keys(): @@ -562,17 +677,40 @@ class Processor(Parser): chapter.set('id', id) return id - css = [] + self.external_stylesheets, self.stylesheet = [], self.css_parser.parseString('') for link in self.root.xpath('//link'): if 'css' in link.get('type', 'text/css').lower(): - file = os.path.join(self.tdir, link.get('href', '')) - if file and os.path.exists(file) and os.path.isfile(file): - css.append(open(file, 'rb').read().decode('utf-8')) - link.getparent().remove(link) - + file = os.path.join(self.tdir, *(link.get('href', '').split('/'))) + if file and not 'http:' in file: + if not parsed_sheets.has_key(file): + try: + self.log_info('Processing stylesheet %s...'%file) + css = self.preprocess_css(open(file).read()) + except (IOError, OSError): + self.log_error('Failed to open stylesheet: %s'%file) + else: + try: + parsed_sheets[file] = self.css_parser.parseString(css) + except: + parsed_sheets[file] = css.decode('utf8', 'replace') + self.log_warning('Failed to parse stylesheet: %s'%file) + if self.opts.verbose > 1: + self.log_exception('') + if parsed_sheets.has_key(file): + self.external_stylesheets.append(parsed_sheets[file]) + + for style in self.root.xpath('//style'): if 'css' in style.get('type', 'text/css').lower(): - css.append('\n'.join(style.xpath('./text()'))) + raw = '\n'.join(style.xpath('./text()')) + css = self.preprocess_css(raw) + try: + sheet = self.css_parser.parseString(css) + except: + self.log_debug('Failed to parse style element') + else: + for rule in sheet: + self.stylesheet.add(rule) style.getparent().remove(style) cache = {} @@ -613,57 +751,19 @@ class Processor(Parser): elem.set('class', cn) elem.attrib.pop('style') - for setting, cn in cache.items(): - css.append('.%s {%s}'%(cn, setting)) - - - self.raw_css = '\n\n'.join(css) - self.css = unicode(self.raw_css) + css = '\n'.join(['.%s {%s;}'%(cn, setting) for \ + setting, cn in cache.items()]) + self.stylesheet = self.css_parser.parseString(self.preprocess_css(css)) + css = '' if self.opts.override_css: - self.css += '\n\n'+self.opts.override_css - self.do_layout() - # TODO: Figure out what to do about CSS imports from linked stylesheets - - def relativize_font_sizes(self, dpi=100, base=16): - ''' - Convert all absolute font sizes to percentages of ``base`` using ``dpi`` - to convert from screen to paper units. - :param base: Base size in pixels. Adobe DE seems to need base size to be 16 - irrespective of the unit of the length being converted - :param dpi: Dots per inch used to convert pixels to absolute lengths. Since - most HTML files are created on computers with monitors of DPI ~ 100, we use - 100 by default. - ''' - size_value_pat = re.compile(r'(?[0-9.]+)(?Pcm|mm|in|pt|pc|px)', re.I) + css += '\n\n' + self.opts.override_css + css += '\n\n' + 'body {margin-top: 0pt; margin-bottom: 0pt; margin-left: 0pt; margin-right: 0pt;}' + css += '\n\n@page {margin-top: %fpt; margin-bottom: %fpt; margin-left: %fpt; margin-right: %fpt}'%(self.opts.margin_top, self.opts.margin_bottom, self.opts.margin_left, self.opts.margin_right) + if self.opts.remove_paragraph_spacing: + css += '\n\np {text-indent: 2.1em; margin-top:1pt; margin-bottom:1pt; padding:0pt; border:0pt;}' + self.override_css = self.css_parser.parseString(self.preprocess_css(css)) - # points per unit - ptu = { # Convert to pt - 'px' : 72./dpi, - 'pt' : 1.0, - 'pc' : 1/12., - 'in' : 72., - 'cm' : 72/2.54, - 'mm' : 72/25.4, - } - def relativize(match): - val = float(match.group('num')) - unit = match.group('unit').lower() - val *= ptu[unit] - return '%.1f%%'%((val/base) * 100) - - - def sub(match): - rule = match.group(1) - value = size_value_pat.sub(relativize, match.group(2)) - return '%s : %s'%(rule, value) - - self.css = re.compile(r'(font|font-size)\s*:\s*([^;]+)', re.I).sub(sub, self.css) - - def do_layout(self): - self.css += '\nbody {margin-top: 0pt; margin-bottom: 0pt; margin-left: 0pt; margin-right: 0pt; font-size: %f%%}\n'%self.opts.base_font_size - self.css += '@page {margin-top: %fpt; margin-bottom: %fpt; margin-left: %fpt; margin-right: %fpt}\n'%(self.opts.margin_top, self.opts.margin_bottom, self.opts.margin_left, self.opts.margin_right) - def config(defaults=None, config_name='html', desc=_('Options to control the traversal of HTML')): if defaults is None: diff --git a/src/calibre/gui2/dialogs/epub.py b/src/calibre/gui2/dialogs/epub.py index 9f6dbd6dc6..78a2be0f51 100644 --- a/src/calibre/gui2/dialogs/epub.py +++ b/src/calibre/gui2/dialogs/epub.py @@ -17,6 +17,7 @@ from calibre.ebooks.epub.from_any import SOURCE_FORMATS, config from calibre.ebooks.metadata import MetaInformation from calibre.ptempfile import PersistentTemporaryFile from calibre.ebooks.metadata.opf import OPFCreator +from lxml.etree import XPath class Config(QDialog, Ui_Dialog): @@ -234,6 +235,16 @@ class Config(QDialog, Ui_Dialog): self.source_format = d.format() def accept(self): + for opt in ('chapter', 'level1_toc', 'level2_toc'): + text = unicode(getattr(self, 'opt_'+opt).text()) + if text: + try: + XPath(text,namespaces={'re':'http://exslt.org/regular-expressions'}) + except Exception, err: + error_dialog(self, _('Invalid XPath expression'), + _('The expression %s is invalid. Error: %s')%(text, err) + ).exec_() + return mi = self.get_metadata() self.read_settings() self.cover_file = None diff --git a/src/calibre/gui2/dialogs/epub.ui b/src/calibre/gui2/dialogs/epub.ui index fe4ccdef5d..3ecc0991e8 100644 --- a/src/calibre/gui2/dialogs/epub.ui +++ b/src/calibre/gui2/dialogs/epub.ui @@ -77,7 +77,7 @@ - 1 + 3 @@ -416,29 +416,36 @@ Base &font size: - opt_base_font_size + opt_base_font_size2 - + - % + pt 0 - 10.000000000000000 + 0.000000000000000 - 500.000000000000000 + 30.000000000000000 - 5.000000000000000 + 1.000000000000000 - 100.000000000000000 + 30.000000000000000 + + + + + + + Remove &spacing between paragraphs @@ -674,6 +681,32 @@ p, li { white-space: pre-wrap; } + + + + + + + Level &1 TOC + + + opt_level1_toc + + + + + + + Level &2 TOC + + + opt_level2_toc + + + + + + diff --git a/src/calibre/linux.py b/src/calibre/linux.py index 7a820d3cfa..33796f5b15 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -295,6 +295,11 @@ complete -o nospace -F _prs500 prs500 ''') f.close() print 'done' + except TypeError, err: + if 'resolve_entities' in str(err): + print 'You need python-lxml >= 2.0.5 for calibre' + sys.exit(1) + raise except: if fatal_errors: raise diff --git a/src/calibre/trac/plugins/download.py b/src/calibre/trac/plugins/download.py index 33049e9dc8..ca5ecabed4 100644 --- a/src/calibre/trac/plugins/download.py +++ b/src/calibre/trac/plugins/download.py @@ -45,7 +45,7 @@ class Distribution(object): INSTALLERS = ('emerge -avn', 'apt-get install', 'yum install') AS_ROOT = (True, False, True) - TITLEMAP = {'gentoo':'Gentoo', 'ubuntu':'Ubuntu Interpid Ibex', + TITLEMAP = {'gentoo':'Gentoo', 'ubuntu':'Ubuntu Intrepid Ibex', 'fedora':'Fedora 10', 'debian':'Debian sid', 'generic': 'Install from source'} MANUAL_MAP = { diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py index 003e9af318..dffb9f8c56 100644 --- a/src/calibre/web/feeds/__init__.py +++ b/src/calibre/web/feeds/__init__.py @@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal ' ''' Contains the logic for parsing feeds. ''' -import time, logging, traceback +import time, logging, traceback, copy from datetime import datetime from calibre.web.feeds.feedparser import parse @@ -17,7 +17,7 @@ class Article(object): def __init__(self, id, title, url, summary, published, content): self.downloaded = False self.id = id - self.title = title + self.title = title.strip() if title else title self.url = url self.summary = summary self.content = content @@ -38,7 +38,14 @@ Has content : %s def __str__(self): return repr(self) - + + def is_same_as(self, other_article): + #if self.title != getattr(other_article, 'title', False): + # return False + if self.url: + return self.url == getattr(other_article, 'url', False) + return self.content == getattr(other_article, 'content', False) + class Feed(object): @@ -169,7 +176,72 @@ class Feed(object): len(a.summary if a.summary else '')) return length > 2000 * len(self) + + def has_article(self, article): + for a in self: + if a.is_same_as(article): + return True + return False + + def find(self, article): + for i, a in enumerate(self): + if a.is_same_as(article): + return i + return -1 + + def remove(self, article): + i = self.index(article) + if i > -1: + self.articles[i:i+1] = [] +class FeedCollection(list): + + def __init__(self, feeds): + list.__init__(self, [f for f in feeds if len(f.articles) > 0]) + found_articles = set([]) + duplicates = set([]) + + def in_set(s, a): + for x in s: + if a.is_same_as(x): + return x + return None + + print '#feeds', len(self) + print map(len, self) + for f in self: + dups = [] + for a in f: + first = in_set(found_articles, a) + if first is not None: + dups.append(a) + duplicates.add((first, f)) + else: + found_articles.add(a) + for x in dups: + f.articles.remove(x) + + self.duplicates = duplicates + print len(duplicates) + print map(len, self) + #raise + + def find_article(self, article): + for j, f in enumerate(self): + for i, a in enumerate(f): + if a is article: + return (j, i) + + def restore_duplicates(self): + temp = [] + for article, feed in self.duplicates: + art = copy.deepcopy(article) + j, i = self.find_article(article) + art.url = '../feed_%d/article_%d/index.html'%(j, i) + temp.append((feed, art)) + for feed, art in temp: + feed.articles.append(art) + def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)): diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 6a9e9acd52..212ca84aac 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -289,15 +289,16 @@ class BasicNewsRecipe(object, LoggingInterface): ''' return soup - def postprocess_html(self, soup): + def postprocess_html(self, soup, first_fetch): ''' This method is called with the source of each downloaded :term:`HTML` file, after it is parsed for links and images. It can be used to do arbitrarily powerful post-processing on the :term:`HTML`. It should return `soup` after processing it. - `soup`: A `BeautifulSoup `_ + :param soup: A `BeautifulSoup `_ instance containing the downloaded :term:`HTML`. + :param first_fetch: True if this is the first page of an article. ''' return soup @@ -482,7 +483,7 @@ class BasicNewsRecipe(object, LoggingInterface): elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') body.insert(0, elem) - return self.postprocess_html(soup) + return self.postprocess_html(soup, first_fetch) def download(self): diff --git a/src/calibre/web/feeds/recipes/espn.py b/src/calibre/web/feeds/recipes/espn.py index d8c33847cf..34a1bc609a 100644 --- a/src/calibre/web/feeds/recipes/espn.py +++ b/src/calibre/web/feeds/recipes/espn.py @@ -67,7 +67,7 @@ class ESPN(BasicNewsRecipe): return soup - def postprocess_html(self, soup): + def postprocess_html(self, soup, first_fetch): for div in soup.findAll('div', style=True): div['style'] = div['style'].replace('center', 'left') return soup diff --git a/src/calibre/web/feeds/recipes/newsweek.py b/src/calibre/web/feeds/recipes/newsweek.py index 0da8b8965d..9ad551c469 100644 --- a/src/calibre/web/feeds/recipes/newsweek.py +++ b/src/calibre/web/feeds/recipes/newsweek.py @@ -92,7 +92,7 @@ class Newsweek(BasicNewsRecipe): return sections - def postprocess_html(self, soup): + def postprocess_html(self, soup, first_fetch): divs = list(soup.findAll('div', 'pagination')) if not divs: return diff --git a/src/calibre/web/feeds/recipes/outlook_india.py b/src/calibre/web/feeds/recipes/outlook_india.py index c5782d1536..db8ad900ab 100644 --- a/src/calibre/web/feeds/recipes/outlook_india.py +++ b/src/calibre/web/feeds/recipes/outlook_india.py @@ -73,7 +73,7 @@ class OutlookIndia(BasicNewsRecipe): return feeds - def postprocess_html(self, soup): + def postprocess_html(self, soup, first_fetch): bad = [] for table in soup.findAll('table'): if table.find(text=re.compile(r'\(\d+ of \d+\)')): diff --git a/src/calibre/web/feeds/recipes/scientific_american.py b/src/calibre/web/feeds/recipes/scientific_american.py index b9ca0f131f..7d22013aaf 100644 --- a/src/calibre/web/feeds/recipes/scientific_american.py +++ b/src/calibre/web/feeds/recipes/scientific_american.py @@ -7,14 +7,16 @@ __docformat__ = 'restructuredtext en' sciam.com ''' import re +from lxml import html from calibre.web.feeds.news import BasicNewsRecipe class ScientificAmerican(BasicNewsRecipe): title = u'Scientific American' - description = u'Popular science' + description = u'Popular science. Monthly magazine.' __author__ = 'Kovid Goyal' oldest_article = 30 max_articles_per_feed = 100 + no_stylesheets = True use_embedded_content = False remove_tags_before = dict(name='div', attrs={'class':'headline'}) remove_tags_after = dict(id='article') @@ -26,25 +28,102 @@ class ScientificAmerican(BasicNewsRecipe): html2lrf_options = ['--base-font-size', '8'] recursions = 1 match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14)'] - feeds = [ - (u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'), - (u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'), - (u'Health', u'http://rss.sciam.com/sciam/health'), - (u'Space', u'http://rss.sciam.com/sciam/space'), - (u'Technology', u'http://rss.sciam.com/sciam/technology'), - (u'Biology', u'http://rss.sciam.com/sciam/biology'), - (u'Mind & Brain', u'http://rss.sciam.com/sciam/mind-and-brain'), - (u"What's Next", u'http://rss.sciam.com/sciam/whats-next'), - (u'Archeology and Paleontology', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'), - (u'Physics', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'), - (u'Math', u'http://rss.sciam.com/sciam/math'), - (u'History of Science', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'), - (u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'), - (u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog') - ] +# feeds = [ +# (u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'), +# (u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'), +# (u'Health', u'http://rss.sciam.com/sciam/health'), +# (u'Space', u'http://rss.sciam.com/sciam/space'), +# (u'Technology', u'http://rss.sciam.com/sciam/technology'), +# (u'Biology', u'http://rss.sciam.com/sciam/biology'), +# (u'Mind & Brain', u'http://rss.sciam.com/sciam/mind-and-brain'), +# (u"What's Next", u'http://rss.sciam.com/sciam/whats-next'), +# (u'Archeology and Paleontology', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'), +# (u'Physics', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'), +# (u'Math', u'http://rss.sciam.com/sciam/math'), +# (u'History of Science', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'), +# (u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'), +# (u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog') +# ] +# + def parse_index(self): + src = self.browser.open('http://www.sciam.com/sciammag/').read() + root = html.fromstring(src) + self.cover_url = root.xpath('//img[re:match(@src, "cover_")]', + namespaces={'re':'http://exslt.org/regular-expressions'} + )[0].get('src') + self.timefmt = root.xpath('//div[@id = "magazine-month"]')[0].text + feeds = [] + features = [] + for a in root.xpath('//a[@href and @title = "Feature"]'): + if not a.text.strip(): + continue + article = { + 'url' : a.get('href'), + 'title' : u''.join(a.xpath('./text()')), + 'date' : '', + 'description' : '', + } + for s in a.itersiblings('span'): + if s.get('class', '') == 'sub': + article['description'] += u''.join(s.xpath('./text()')) + ' ' + features.append(article) + if features: + feeds.append(('Features', features)) + + departments = [] + for a in root.xpath('//a[@href and @class="title"]'): + txt = u''.join(a.xpath('./text()')).strip() + if not txt: + continue + article = { + 'url' : a.get('href'), + 'title' : txt, + 'date' : '', + 'description' : '', + } + p = a.getparent() + p.remove(a) + article['description'] = u''.join(p.xpath('./text()')) + departments.append(article) + + feeds.append(('Departments', departments)) + opinion = [] + for a in root.xpath('//div[@id = "opinion"]//a[@href]'): + txt = u''.join(a.xpath('./text()')).strip() + if not txt: + continue + article = { + 'url' : a.get('href'), + 'title' : txt, + 'date' : '', + 'description' : '', + } + opinion.append(article) + feeds.append(('Opinion', opinion)) + + ontheweb = [] + for a in root.xpath('//div[@id = "ontheweb"]//a[@href]'): + txt = u''.join(a.xpath('./text()')).strip() + if not txt: + continue + article = { + 'url' : a.get('href'), + 'title' : txt, + 'date' : '', + 'description' : '', + } + ontheweb.append(article) + feeds.append(('On the web', ontheweb)) + + return feeds + - def postprocess_html(self, soup): + def postprocess_html(self, soup, first_fetch): if soup is not None: for span in soup.findAll('span', attrs={'class':'pagination'}): span.extract() + if not first_fetch: + div = soup.find('div', attrs={'class':'headline'}) + if div: + div.extract() return soup diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py index bd867a2045..c220e8390f 100644 --- a/src/calibre/web/fetch/simple.py +++ b/src/calibre/web/fetch/simple.py @@ -198,7 +198,7 @@ class RecursiveFetcher(object, LoggingInterface): try: f = self.fetch_url(iurl) except Exception, err: - self.log_warning('Could not fetch stylesheet %s', iurl) + self.log_debug('Could not fetch stylesheet %s', iurl) self.log_debug('Error: %s', str(err), exc_info=True) continue stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')