diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 1f33e34483..61d7801cb4 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -284,7 +284,7 @@ def english_sort(x, y): class LoggingInterface: def __init__(self, logger): - self.__logger = logger + self.__logger = self.logger = logger def setup_cli_handler(self, verbosity): for handler in self.__logger.handlers: diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index bf151d840c..1d8b6e6b4c 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -7,7 +7,6 @@ __docformat__ = 'restructuredtext en' Conversion to EPUB. ''' import sys, textwrap -from lxml import html from calibre.utils.config import Config, StringConfig from calibre.utils.zipfile import ZipFile, ZIP_STORED from calibre.ebooks.html import config as common_config, tostring @@ -16,13 +15,11 @@ class DefaultProfile(object): flow_size = sys.maxint screen_size = None - dpi = 100 class PRS505(DefaultProfile): flow_size = 300000 screen_size = (600, 775) - dpi = 166 PROFILES = { @@ -30,6 +27,13 @@ PROFILES = { 'None' : DefaultProfile, } +def rules(stylesheets): + for s in stylesheets: + if hasattr(s, 'cssText'): + for r in s: + if r.type == r.STYLE_RULE: + yield r + def initialize_container(path_to_container, opf_name='metadata.opf'): ''' Create an empty EPUB document, with a default skeleton. @@ -95,6 +99,12 @@ to auto-generate a Table of Contents. help=_("Don't add auto-detected chapters to the Table of Contents.")) toc('toc_threshold', ['--toc-threshold'], default=6, help=_('If fewer than this number of chapters is detected, then links are added to the Table of Contents.')) + toc('level1_toc', ['--level1-toc'], default=None, + help=_('XPath expression that specifies all tags that should be added to the Table of Contents at level one. If this is specified, it takes precedence over other forms of auto-detection.')) + toc('level2_toc', ['--level2-toc'], default=None, + help=_('XPath expression that specifies all tags that should be added to the Table of Contents at level two. Each entry is added under the previous level one entry.')) + toc('from_ncx', ['--from-ncx'], default=None, + help=_('Path to a .ncx file that contains the table of contents to use for this ebook. The NCX file should contain links relative to the directory it is placed in. See http://www.niso.org/workrooms/daisy/Z39-86-2005.html#NCX for an overview of the NCX format.')) toc('use_auto_toc', ['--use-auto-toc'], default=False, help=_('Normally, if the source file already has a Table of Contents, it is used in preference to the autodetected one. With this option, the autodetected one is always used.')) @@ -107,8 +117,10 @@ to auto-generate a Table of Contents. help=_('Set the left margin in pts. Default is %default')) layout('margin_right', ['--margin-right'], default=5.0, help=_('Set the right margin in pts. Default is %default')) - layout('base_font_size', ['--base-font-size'], default=100.0, - help=_('The base font size as a percentage. Default is %default. Changing this should allow you to control overall base font sizes, except for input HTML files that use absolute font sizes for their text tags.')) + layout('base_font_size2', ['--base-font-size'], default=12.0, + help=_('The base font size in pts. Default is %defaultpt. Set to 0 to disable rescaling of fonts.')) + layout('remove_paragraph_spacing', ['--remove-paragraph-spacing'], default=True, + help=_('Remove spacing between paragraphs. Will not work if the source file forces inter-paragraph spacing.')) c.add_opt('show_opf', ['--show-opf'], default=False, group='debug', help=_('Print generated OPF file to stdout')) diff --git a/src/calibre/ebooks/epub/fonts.py b/src/calibre/ebooks/epub/fonts.py new file mode 100644 index 0000000000..5d0887f2d0 --- /dev/null +++ b/src/calibre/ebooks/epub/fonts.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +Font size rationalization. See :function:`relativize`. +''' + +import logging, re, operator, functools, collections, unittest, copy, sys +from xml.dom import SyntaxErr + +from lxml.cssselect import CSSSelector +from lxml import etree +from lxml.html import HtmlElement + +from calibre.ebooks.html import fromstring +from calibre.ebooks.epub import rules +from cssutils import CSSParser + +num = r'[-]?\d+|[-]?\d*\.\d+' +length = r'(?P0)|(?P{num})(?P%|em|ex|px|in|cm|mm|pt|pc)'.replace('{num}', num) +absolute_size = r'(?P(x?x-)?(small|large)|medium)' +relative_size = r'(?Psmaller|larger)' + +font_size_pat = re.compile('|'.join((relative_size, absolute_size, length)), re.I) +line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num)) + +PTU = { + 'in' : 72., + 'cm' : 72/2.54, + 'mm' : 72/25.4, + 'pt' : 1.0, + 'pc' : 1/12., + } + +DEFAULT_FONT_SIZE = 12 + +class Rationalizer(object): + + @classmethod + def specificity(cls, s): + '''Map CSS specificity tuple to a single integer''' + return sum([10**(4-i) + x for i,x in enumerate(s)]) + + @classmethod + def compute_font_size(cls, elem): + ''' + Calculate the effective font size of an element traversing its ancestors as far as + neccessary. + ''' + cfs = elem.computed_font_size + if cfs is not None: + return + sfs = elem.specified_font_size + if callable(sfs): + parent = elem.getparent() + cls.compute_font_size(parent) + elem.computed_font_size = sfs(parent.computed_font_size) + else: + elem.computed_font_size = sfs + + @classmethod + def calculate_font_size(cls, style): + 'Return font size in pts from style object. For relative units returns a callable' + match = font_size_pat.search(style.font) + fs = '' + if match: + fs = match.group() + if style.fontSize: + fs = style.fontSize + + match = font_size_pat.search(fs) + if match is None: + return None + match = match.groupdict() + unit = match.get('unit', '') + if unit: unit = unit.lower() + if unit in PTU.keys(): + return PTU[unit] * float(match['num']) + if unit in ('em', 'ex'): + return functools.partial(operator.mul, float(match['num'])) + if unit == '%': + return functools.partial(operator.mul, float(match['num'])/100.) + abs = match.get('abs', '') + if abs: abs = abs.lower() + if abs: + x = (1.2)**(abs.count('x') * (-1 if 'small' in abs else 1)) + return 12 * x + if match.get('zero', False): + return 0. + return functools.partial(operator.mul, 1.2) if 'larger' in fs.lower() else functools.partial(operator.mul, 0.8) + + @classmethod + def resolve_rules(cls, stylesheets): + for sheet in stylesheets: + if hasattr(sheet, 'fs_rules'): + continue + sheet.fs_rules = [] + sheet.lh_rules = [] + for r in sheet: + if r.type == r.STYLE_RULE: + font_size = cls.calculate_font_size(r.style) + if font_size is not None: + for s in r.selectorList: + sheet.fs_rules.append([CSSSelector(s.selectorText), font_size]) + orig = line_height_pat.search(r.style.lineHeight) + if orig is not None: + for s in r.selectorList: + sheet.lh_rules.append([CSSSelector(s.selectorText), float(orig.group(1)) * PTU[orig.group(2).lower()]]) + + + @classmethod + def apply_font_size_rules(cls, stylesheets, root): + 'Add a ``specified_font_size`` attribute to every element that has a specified font size' + cls.resolve_rules(stylesheets) + for sheet in stylesheets: + for selector, font_size in sheet.fs_rules: + elems = selector(root) + for elem in elems: + elem.specified_font_size = font_size + + @classmethod + def remove_font_size_information(cls, stylesheets): + for r in rules(stylesheets): + r.style.removeProperty('font-size') + try: + new = font_size_pat.sub('', r.style.font).strip() + if new: + r.style.font = new + else: + r.style.removeProperty('font') + except SyntaxErr: + r.style.removeProperty('font') + if line_height_pat.search(r.style.lineHeight) is not None: + r.style.removeProperty('line-height') + + @classmethod + def compute_font_sizes(cls, root, stylesheets, base=12): + stylesheets = [s for s in stylesheets if hasattr(s, 'cssText')] + cls.apply_font_size_rules(stylesheets, root) + + # Compute the effective font size of all tags + root.computed_font_size = DEFAULT_FONT_SIZE + for elem in root.iter(etree.Element): + cls.compute_font_size(elem) + + extra_css = {} + if base > 0: + # Calculate the "base" (i.e. most common) font size + font_sizes = collections.defaultdict(lambda : 0) + body = root.xpath('//body')[0] + IGNORE = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6') + for elem in body.iter(etree.Element): + if elem.tag not in IGNORE: + t = getattr(elem, 'text', '') + if t: t = t.strip() + if t: + font_sizes[elem.computed_font_size] += len(t) + + t = getattr(elem, 'tail', '') + if t: t = t.strip() + if t: + parent = elem.getparent() + if parent.tag not in IGNORE: + font_sizes[parent.computed_font_size] += len(t) + + try: + most_common = max(font_sizes.items(), key=operator.itemgetter(1))[0] + scale = base/most_common if most_common > 0 else 1. + except ValueError: + scale = 1. + + # rescale absolute line-heights + counter = 0 + for sheet in stylesheets: + for selector, lh in sheet.lh_rules: + for elem in selector(root): + elem.set('id', elem.get('id', 'cfs_%d'%counter)) + counter += 1 + if not extra_css.has_key(elem.get('id')): + extra_css[elem.get('id')] = [] + extra_css[elem.get('id')].append('line-height:%fpt'%(lh*scale)) + + + + # Rescale all computed font sizes + for elem in body.iter(etree.Element): + if isinstance(elem, HtmlElement): + elem.computed_font_size *= scale + + # Remove all font size specifications from the last stylesheet + cls.remove_font_size_information(stylesheets[-1:]) + + # Create the CSS to implement the rescaled font sizes + for elem in body.iter(etree.Element): + cfs, pcfs = map(operator.attrgetter('computed_font_size'), (elem, elem.getparent())) + if abs(cfs-pcfs) > 1/12. and abs(pcfs) > 1/12.: + elem.set('id', elem.get('id', 'cfs_%d'%counter)) + counter += 1 + if not extra_css.has_key(elem.get('id')): + extra_css[elem.get('id')] = [] + extra_css[elem.get('id')].append('font-size: %f%%'%(100*(cfs/pcfs))) + + css = CSSParser(loglevel=logging.ERROR).parseString('') + for id, r in extra_css.items(): + css.add('#%s {%s}'%(id, ';'.join(r))) + return css + + @classmethod + def rationalize(cls, stylesheets, root, opts): + logger = logging.getLogger('html2epub') + logger.info('\t\tRationalizing fonts...') + extra_css = None + if opts.base_font_size2 > 0: + try: + extra_css = cls.compute_font_sizes(root, stylesheets, base=opts.base_font_size2) + except: + logger.warning('Failed to rationalize font sizes.') + if opts.verbose > 1: + logger.exception('') + finally: + root.remove_font_size_information() + logger.debug('\t\tDone rationalizing') + return extra_css + +################################################################################ +############## Testing +################################################################################ + +class FontTest(unittest.TestCase): + + def setUp(self): + from calibre.ebooks.epub import config + self.opts = config(defaults='').parse() + self.html = ''' + + + Test document + + +
+ +

Some text

+
+

Some other text.

+

The longest piece of single font size text in this entire file. Used to test resizing.

+ + + ''' + self.root = fromstring(self.html) + + def do_test(self, css, base=DEFAULT_FONT_SIZE, scale=1): + root1 = copy.deepcopy(self.root) + root1.computed_font_size = DEFAULT_FONT_SIZE + stylesheet = CSSParser(loglevel=logging.ERROR).parseString(css) + stylesheet2 = Rationalizer.compute_font_sizes(root1, [stylesheet], base) + root2 = copy.deepcopy(root1) + root2.remove_font_size_information() + root2.computed_font_size = DEFAULT_FONT_SIZE + Rationalizer.apply_font_size_rules([stylesheet2], root2) + for elem in root2.iter(etree.Element): + Rationalizer.compute_font_size(elem) + for e1, e2 in zip(root1.xpath('//body')[0].iter(etree.Element), root2.xpath('//body')[0].iter(etree.Element)): + self.assertAlmostEqual(e1.computed_font_size, e2.computed_font_size, + msg='Computed font sizes for %s not equal. Original: %f Processed: %f'%\ + (root1.getroottree().getpath(e1), e1.computed_font_size, e2.computed_font_size)) + return stylesheet2.cssText + + def testStripping(self): + 'Test that any original entries are removed from the CSS' + css = 'p { font: bold 10px italic smaller; font-size: x-large} \na { font-size: 0 }' + css = CSSParser(loglevel=logging.ERROR).parseString(css) + Rationalizer.compute_font_sizes(copy.deepcopy(self.root), [css]) + self.assertEqual(css.cssText.replace(' ', '').replace('\n', ''), + 'p{font:bolditalic}') + + def testIdentity(self): + 'Test that no unnecessary font size changes are made' + extra_css = self.do_test('div {font-size:12pt} \nspan {font-size:100%}') + self.assertEqual(extra_css.strip(), '') + + def testRelativization(self): + 'Test conversion of absolute to relative sizes' + self.do_test('#p1 {font: 24pt} b {font: 12pt} .it {font: 48pt} #p2 {font: 100%}') + + def testResizing(self): + 'Test resizing of fonts' + self.do_test('#longest {font: 24pt} .it {font:20pt; line-height:22pt}') + + +def suite(): + return unittest.TestLoader().loadTestsFromTestCase(FontTest) + +def test(): + unittest.TextTestRunner(verbosity=2).run(suite()) + +if __name__ == '__main__': + sys.exit(test()) + \ No newline at end of file diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index e5fe93ce27..54f91e9f0f 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -32,8 +32,7 @@ Conversion of HTML/OPF files follows several stages: * The EPUB container is created. ''' -import os, sys, re, cStringIO, logging -from contextlib import nested +import os, sys, cStringIO, logging from lxml.etree import XPath try: @@ -41,7 +40,7 @@ try: except ImportError: import Image as PILImage -from calibre.ebooks.html import Processor, get_text, merge_metadata, get_filelist,\ +from calibre.ebooks.html import Processor, merge_metadata, get_filelist,\ opf_traverse, create_metadata, rebase_toc from calibre.ebooks.epub import config as common_config from calibre.ptempfile import TemporaryDirectory @@ -50,21 +49,23 @@ from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata.opf2 import OPF from calibre.ebooks.epub import initialize_container, PROFILES from calibre.ebooks.epub.split import split +from calibre.ebooks.epub.fonts import Rationalizer from calibre.constants import preferred_encoding -class HTMLProcessor(Processor): +class HTMLProcessor(Processor, Rationalizer): - def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles): + def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, stylesheets): Processor.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, - name='html2epub') + name='html2epub') if opts.verbose > 2: self.debug_tree('parsed') self.detect_chapters() - - self.extract_css() - self.relativize_font_sizes() + self.extract_css(stylesheets) + if self.opts.base_font_size2 > 0: + self.font_css = self.rationalize(self.external_stylesheets+[self.stylesheet], + self.root, self.opts) if opts.verbose > 2: self.debug_tree('nocss') @@ -73,19 +74,6 @@ class HTMLProcessor(Processor): meta.getparent().remove(meta) Processor.save(self) - #self.collect_font_statistics() - - - def collect_font_statistics(self): - ''' - Collect font statistics to figure out the base font size used in this - HTML document. - ''' - self.font_statistics = {} #: A mapping of font size (in pts) to number of characters rendered at that font size - for text in get_text(self.body if self.body is not None else self.root): - length, parent = len(re.sub(r'\s+', '', text)), text.getparent() - #TODO: Use cssutils on self.raw_css to figure out the font size - # of this piece of text and update statistics accordingly @@ -104,21 +92,30 @@ the element of the OPF file. def parse_content(filelist, opts, tdir): os.makedirs(os.path.join(tdir, 'content', 'resources')) - resource_map = {} + resource_map, stylesheets = {}, {} toc = TOC(base_path=tdir, type='root') + stylesheet_map = {} for htmlfile in filelist: + logging.getLogger('html2epub').debug('Processing %s...'%htmlfile) hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'), - resource_map, filelist) + resource_map, filelist, stylesheets) hp.populate_toc(toc) hp.save() + stylesheet_map[os.path.basename(hp.save_path())] = \ + [s for s in hp.external_stylesheets + [hp.stylesheet, hp.font_css, hp.override_css] if s is not None] + logging.getLogger('html2epub').debug('Saving stylesheets...') + if opts.base_font_size2 > 0: + Rationalizer.remove_font_size_information(stylesheets.values()) + for path, css in stylesheets.items(): + open(path, 'wb').write(getattr(css, 'cssText', css).encode('utf-8')) if toc.count('chapter') > opts.toc_threshold: toc.purge(['file', 'link', 'unknown']) if toc.count('chapter') + toc.count('file') > opts.toc_threshold: toc.purge(['link', 'unknown']) toc.purge(['link'], max=opts.max_toc_links) - return resource_map, hp.htmlfile_map, toc + return resource_map, hp.htmlfile_map, toc, stylesheet_map def resize_cover(im, opts): width, height = im.size @@ -176,7 +173,7 @@ def process_title_page(mi, filelist, htmlfilemap, opts, tdir): Cover - +
cover
@@ -212,11 +209,22 @@ def convert(htmlfile, opts, notification=None): mi = merge_metadata(htmlfile, opf, opts) opts.chapter = XPath(opts.chapter, namespaces={'re':'http://exslt.org/regular-expressions'}) + if opts.level1_toc: + opts.level1_toc = XPath(opts.level1_toc, + namespaces={'re':'http://exslt.org/regular-expressions'}) + else: + opts.level1_toc = None + if opts.level2_toc: + opts.level2_toc = XPath(opts.level2_toc, + namespaces={'re':'http://exslt.org/regular-expressions'}) + else: + opts.level2_toc = None with TemporaryDirectory(suffix='_html2epub', keep=opts.keep_intermediate) as tdir: if opts.keep_intermediate: print 'Intermediate files in', tdir - resource_map, htmlfile_map, generated_toc = parse_content(filelist, opts, tdir) + resource_map, htmlfile_map, generated_toc, stylesheet_map = \ + parse_content(filelist, opts, tdir) logger = logging.getLogger('html2epub') resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()] @@ -235,6 +243,10 @@ def convert(htmlfile, opts, notification=None): rebase_toc(mi.toc, htmlfile_map, tdir) if opts.use_auto_toc or mi.toc is None or len(list(mi.toc.flat())) < 2: mi.toc = generated_toc + if opts.from_ncx: + toc = TOC() + toc.read_ncx_toc(opts.from_ncx) + mi.toc = toc for item in mi.manifest: if getattr(item, 'mime_type', None) == 'text/html': item.mime_type = 'application/xhtml+xml' @@ -247,7 +259,7 @@ def convert(htmlfile, opts, notification=None): f.write(toc) if opts.show_ncx: print toc - split(opf_path, opts) + split(opf_path, opts, stylesheet_map) opf = OPF(opf_path, tdir) opf.remove_guide() if has_title_page: diff --git a/src/calibre/ebooks/epub/split.py b/src/calibre/ebooks/epub/split.py index 11df503dc4..30d3857941 100644 --- a/src/calibre/ebooks/epub/split.py +++ b/src/calibre/ebooks/epub/split.py @@ -12,10 +12,9 @@ import os, math, logging, functools, collections, re, copy from lxml.etree import XPath as _XPath from lxml import etree, html from lxml.cssselect import CSSSelector -from cssutils import CSSParser from calibre.ebooks.metadata.opf2 import OPF -from calibre.ebooks.epub import tostring +from calibre.ebooks.epub import tostring, rules from calibre import CurrentDir, LoggingInterface XPath = functools.partial(_XPath, namespaces={'re':'http://exslt.org/regular-expressions'}) @@ -35,7 +34,7 @@ class SplitError(ValueError): class Splitter(LoggingInterface): - def __init__(self, path, opts, always_remove=False): + def __init__(self, path, opts, stylesheet_map, always_remove=False): LoggingInterface.__init__(self, logging.getLogger('htmlsplit')) self.setup_cli_handler(opts.verbose) self.path = path @@ -46,22 +45,8 @@ class Splitter(LoggingInterface): self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.) root = html.fromstring(open(content(path)).read()) - css = XPath('//link[@type = "text/css" and @rel = "stylesheet"]')(root) - if css: - cssp = os.path.join('content', *(css[0].get('href').split('/'))) - self.log_debug('\t\tParsing stylesheet...') - try: - stylesheet = CSSParser().parseString(open(cssp, 'rb').read()) - except: - self.log_warn('Failed to parse CSS. Splitting on page-breaks is disabled') - if self.opts.verbose > 1: - self.log_exception('') - stylesheet = None - else: - stylesheet = None self.page_breaks = [] - if stylesheet is not None: - self.find_page_breaks(stylesheet, root) + self.find_page_breaks(stylesheet_map[self.path], root) self.trees = [] self.split_size = 0 @@ -189,14 +174,12 @@ class Splitter(LoggingInterface): self.split(t) - def find_page_breaks(self, stylesheet, root): + def find_page_breaks(self, stylesheets, root): ''' Find all elements that have either page-break-before or page-break-after set. ''' page_break_selectors = set([]) - for rule in stylesheet: - if rule.type != rule.STYLE_RULE: - continue + for rule in rules(stylesheets): before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower() after = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower() try: @@ -385,7 +368,7 @@ def fix_ncx(path, changes): if changed: open(path, 'wb').write(etree.tostring(tree.getroot(), encoding='UTF-8', xml_declaration=True)) -def split(pathtoopf, opts): +def split(pathtoopf, opts, stylesheet_map): pathtoopf = os.path.abspath(pathtoopf) with CurrentDir(os.path.dirname(pathtoopf)): opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf)) @@ -403,7 +386,7 @@ def split(pathtoopf, opts): for f in html_files: if os.stat(content(f)).st_size > opts.profile.flow_size: try: - changes.append(Splitter(f, opts, + changes.append(Splitter(f, opts, stylesheet_map, always_remove=(always_remove or \ os.stat(content(f)).st_size > 5*opts.profile.flow_size))) except (SplitError, RuntimeError): diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index e22f2bbc0e..e5e6f3f37a 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -8,12 +8,14 @@ Code to recursively parse HTML files and create an open ebook in a specified directory or zip file. All the action starts in :function:`create_dir`. ''' -import sys, re, os, shutil, logging, tempfile, cStringIO +import sys, re, os, shutil, logging, tempfile, cStringIO, operator, functools from urlparse import urlparse from urllib import unquote -from lxml import html, etree -from lxml.html import soupparser +from lxml import etree +from lxml.html import HtmlElementClassLookup, HTMLParser as _HTMLParser, \ + fromstring as _fromstring, tostring as _tostring, \ + soupparser, HtmlElement from lxml.etree import XPath get_text = XPath("//text()") @@ -25,9 +27,67 @@ from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.opf2 import OPF, OPFCreator from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile from calibre.utils.zipfile import ZipFile +from cssutils import CSSParser + +class HTMLElement(HtmlElement): + + @apply + def specified_font_size(): + + def fget(self): + ans = self.get('specified_font_size', '') + if not ans: + return lambda x: x + if ans.startswith('f'): + return functools.partial(operator.mul, float(ans[1:])) + return float(ans) + + def fset(self, val): + self.set('specified_font_size', ('f'+repr(val(1))) if callable(val) else repr(val)) + + return property(fget=fget, fset=fset) + + @apply + def computed_font_size(): + def fget(self): + ans = self.get('computed_font_size', '') + if ans == '': + return None + return float(ans) + + def fset(self, val): + self.set('computed_font_size', repr(val)) + + return property(fget=fget, fset=fset) + + def remove_font_size_information(self): + for elem in self.iter(): + for p in ('computed', 'specified'): + elem.attrib.pop(p+'_font_size', None) + + def getpath(self): + return self.getroottree().getpath(self) + +class Lookup(HtmlElementClassLookup): + + def lookup(self, node_type, document, namespace, name): + if node_type == 'element': + return HTMLElement + return HtmlElementClassLookup.lookup(self, node_type, document, namespace, name) + +class HTMLParser(_HTMLParser): + + def __init__(self, **kwargs): + super(HTMLParser, self).__init__(**kwargs) + self.set_element_class_lookup(Lookup()) + +parser = HTMLParser() + +def fromstring(raw, **kw): + return _fromstring(raw, parser=parser, **kw) def tostring(root, pretty_print=False): - return html.tostring(root, encoding='utf-8', method='xml', + return _tostring(root, encoding='utf-8', method='xml', include_meta_content_type=True, pretty_print=pretty_print) @@ -372,11 +432,11 @@ class Parser(PreProcessor, LoggingInterface): for pat in ENCODING_PATS: src = pat.sub('', src) try: - self.root = html.fromstring(src) + self.root = fromstring(src) except: if self.opts.verbose: self.log_exception('lxml based parsing failed') - self.root = soupparser.fromstring(src) + self.root = soupparser.fromstring(src, makeelement=parser.makeelement) head = self.root.xpath('./head') if head: head = head[0] @@ -402,7 +462,7 @@ class Parser(PreProcessor, LoggingInterface): os.makedirs(tdir) with open(os.path.join(tdir, '%s-%s.html'%\ (os.path.basename(self.htmlfile.path), name)), 'wb') as f: - f.write(html.tostring(self.root, encoding='utf-8')) + f.write(tostring(self.root, encoding='utf-8')) self.log_debug(_('Written processed HTML to ')+f.name) @@ -443,19 +503,21 @@ class Processor(Parser): ''' LINKS_PATH = XPath('//a[@href]') + PIXEL_PAT = re.compile(r'([-]?\d+|[-]?\d*\.\d+)px') + + def __init__(self, *args, **kwargs): + Parser.__init__(self, *args, **kwargs) + temp = LoggingInterface(logging.getLogger('cssutils')) + temp.setup_cli_handler(self.opts.verbose) + self.css_parser = CSSParser(log=temp.logger, loglevel=logging.ERROR) + self.stylesheet = self.font_css = self.override_css = None def detect_chapters(self): self.detected_chapters = self.opts.chapter(self.root) for elem in self.detected_chapters: text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')]) self.log_info('\tDetected chapter: %s', text[:50]) - if self.opts.chapter_mark in ('both', 'pagebreak'): - style = elem.get('style', '').strip() - if style and not style.endswith(';'): - style += '; ' - style += 'page-break-before: always' - elem.set('style', style) - if self.opts.chapter_mark in ('both', 'rule'): + if self.opts.chapter_mark != 'none': hr = etree.Element('hr') if elem.getprevious() is None: elem.getparent()[:0] = [hr] @@ -466,16 +528,28 @@ class Processor(Parser): insert = i break elem.getparent()[insert:insert] = [hr] + if self.opts.chapter_mark != 'rule': + hr.set('style', 'width:0pt;page-break-before:always') + if self.opts.chapter_mark == 'both': + hr2 = etree.Element('hr') + hr2.tail = u'\u00a0' + p = hr.getparent() + i = p.index(hr) + p[i:i] = [hr2] + def save(self): - style_path = os.path.basename(self.save_path())+'.css' - style = etree.SubElement(self.head, 'link', attrib={'type':'text/css', 'rel':'stylesheet', - 'href':'resources/'+style_path, - 'charset':'UTF-8'}) - style.tail = '\n' - style_path = os.path.join(os.path.dirname(self.save_path()), 'resources', style_path) - open(style_path, 'wb').write(self.css.encode('utf-8')) + style_path = os.path.splitext(os.path.basename(self.save_path()))[0] + for i, sheet in enumerate([self.stylesheet, self.font_css, self.override_css]): + if sheet is not None: + style = etree.SubElement(self.head, 'link', attrib={'type':'text/css', 'rel':'stylesheet', + 'href':'resources/%s_%d.css'%(style_path, i), + 'charset':'UTF-8'}) + style.tail = '\n' + path = os.path.join(os.path.dirname(self.save_path()), *(style.get('href').split('/'))) + self.resource_map[path] = style.get('href') + open(path, 'wb').write(getattr(sheet, 'cssText', sheet).encode('utf-8')) return Parser.save(self) def populate_toc(self, toc): @@ -491,14 +565,45 @@ class Processor(Parser): text = text[:50] + u'\u2026' return target.add_item(href, fragment, text, type=type) - # Add chapters to TOC + name = self.htmlfile_map[self.htmlfile.path] + href = 'content/'+name + + # Add level 1 and level 2 TOC items counter = 0 + if self.opts.level1_toc is not None: + level1 = self.opts.level1_toc(self.root) + if level1: + added = {} + for elem in level1: + text = (u''.join(elem.xpath('string()'))).strip() + if text: + id = elem.get('id', 'calibre_chapter_%d'%counter) + counter += 1 + elem.set('id', id) + added[elem] = add_item(href, id, text, toc, type='chapter') + add_item(href, id, 'Top', added[elem], type='chapter') + if self.opts.level2_toc is not None: + level2 = list(self.opts.level2_toc(self.root)) + for elem in level2: + level1 = None + for item in self.root.iterdescendants(): + if item in added.keys(): + level1 = added[item] + elif item == elem and level1 is not None: + text = (u''.join(elem.xpath('string()'))).strip() + if text: + id = elem.get('id', 'calibre_chapter_%d'%counter) + counter += 1 + elem.set('id', id) + add_item(href, id, text, level1, type='chapter') + + + # Add chapters to TOC + if not self.opts.no_chapters_in_toc: for elem in getattr(self, 'detected_chapters', []): text = (u''.join(elem.xpath('string()'))).strip() if text: - name = self.htmlfile_map[self.htmlfile.path] - href = 'content/'+name counter += 1 id = elem.get('id', 'calibre_chapter_%d'%counter) elem.set('id', id) @@ -518,8 +623,7 @@ class Processor(Parser): pass - name = self.htmlfile_map[self.htmlfile.path] - href = 'content/'+name + if referrer.href != href: # Happens for root file @@ -541,13 +645,24 @@ class Processor(Parser): name = self.htmlfile_map[self.htmlfile.referrer.path] add_item(href, fragment, text, target) - + @classmethod + def preprocess_css(cls, css, dpi=96): + def rescale(match): + val = match.group(1) + try: + val = float(val) + except ValueError: + return '' + return '%fpt'%(72 * val/dpi) - def extract_css(self): + return cls.PIXEL_PAT.sub(rescale, css) + + def extract_css(self, parsed_sheets): ''' - Remove all CSS information from the document and store in self.raw_css. - This includes tags. + Remove all CSS information from the document and store it as + :class:`StyleSheet` objects. ''' + def get_id(chapter, counter, prefix='calibre_css_'): new_id = '%s_%d'%(prefix, counter) if chapter.tag.lower() == 'a' and 'name' in chapter.keys(): @@ -562,17 +677,40 @@ class Processor(Parser): chapter.set('id', id) return id - css = [] + self.external_stylesheets, self.stylesheet = [], self.css_parser.parseString('') for link in self.root.xpath('//link'): if 'css' in link.get('type', 'text/css').lower(): - file = os.path.join(self.tdir, link.get('href', '')) - if file and os.path.exists(file) and os.path.isfile(file): - css.append(open(file, 'rb').read().decode('utf-8')) - link.getparent().remove(link) - + file = os.path.join(self.tdir, *(link.get('href', '').split('/'))) + if file and not 'http:' in file: + if not parsed_sheets.has_key(file): + try: + self.log_info('Processing stylesheet %s...'%file) + css = self.preprocess_css(open(file).read()) + except (IOError, OSError): + self.log_error('Failed to open stylesheet: %s'%file) + else: + try: + parsed_sheets[file] = self.css_parser.parseString(css) + except: + parsed_sheets[file] = css.decode('utf8', 'replace') + self.log_warning('Failed to parse stylesheet: %s'%file) + if self.opts.verbose > 1: + self.log_exception('') + if parsed_sheets.has_key(file): + self.external_stylesheets.append(parsed_sheets[file]) + + for style in self.root.xpath('//style'): if 'css' in style.get('type', 'text/css').lower(): - css.append('\n'.join(style.xpath('./text()'))) + raw = '\n'.join(style.xpath('./text()')) + css = self.preprocess_css(raw) + try: + sheet = self.css_parser.parseString(css) + except: + self.log_debug('Failed to parse style element') + else: + for rule in sheet: + self.stylesheet.add(rule) style.getparent().remove(style) cache = {} @@ -613,57 +751,19 @@ class Processor(Parser): elem.set('class', cn) elem.attrib.pop('style') - for setting, cn in cache.items(): - css.append('.%s {%s}'%(cn, setting)) - - - self.raw_css = '\n\n'.join(css) - self.css = unicode(self.raw_css) + css = '\n'.join(['.%s {%s;}'%(cn, setting) for \ + setting, cn in cache.items()]) + self.stylesheet = self.css_parser.parseString(self.preprocess_css(css)) + css = '' if self.opts.override_css: - self.css += '\n\n'+self.opts.override_css - self.do_layout() - # TODO: Figure out what to do about CSS imports from linked stylesheets - - def relativize_font_sizes(self, dpi=100, base=16): - ''' - Convert all absolute font sizes to percentages of ``base`` using ``dpi`` - to convert from screen to paper units. - :param base: Base size in pixels. Adobe DE seems to need base size to be 16 - irrespective of the unit of the length being converted - :param dpi: Dots per inch used to convert pixels to absolute lengths. Since - most HTML files are created on computers with monitors of DPI ~ 100, we use - 100 by default. - ''' - size_value_pat = re.compile(r'(?[0-9.]+)(?Pcm|mm|in|pt|pc|px)', re.I) + css += '\n\n' + self.opts.override_css + css += '\n\n' + 'body {margin-top: 0pt; margin-bottom: 0pt; margin-left: 0pt; margin-right: 0pt;}' + css += '\n\n@page {margin-top: %fpt; margin-bottom: %fpt; margin-left: %fpt; margin-right: %fpt}'%(self.opts.margin_top, self.opts.margin_bottom, self.opts.margin_left, self.opts.margin_right) + if self.opts.remove_paragraph_spacing: + css += '\n\np {text-indent: 2.1em; margin-top:1pt; margin-bottom:1pt; padding:0pt; border:0pt;}' + self.override_css = self.css_parser.parseString(self.preprocess_css(css)) - # points per unit - ptu = { # Convert to pt - 'px' : 72./dpi, - 'pt' : 1.0, - 'pc' : 1/12., - 'in' : 72., - 'cm' : 72/2.54, - 'mm' : 72/25.4, - } - def relativize(match): - val = float(match.group('num')) - unit = match.group('unit').lower() - val *= ptu[unit] - return '%.1f%%'%((val/base) * 100) - - - def sub(match): - rule = match.group(1) - value = size_value_pat.sub(relativize, match.group(2)) - return '%s : %s'%(rule, value) - - self.css = re.compile(r'(font|font-size)\s*:\s*([^;]+)', re.I).sub(sub, self.css) - - def do_layout(self): - self.css += '\nbody {margin-top: 0pt; margin-bottom: 0pt; margin-left: 0pt; margin-right: 0pt; font-size: %f%%}\n'%self.opts.base_font_size - self.css += '@page {margin-top: %fpt; margin-bottom: %fpt; margin-left: %fpt; margin-right: %fpt}\n'%(self.opts.margin_top, self.opts.margin_bottom, self.opts.margin_left, self.opts.margin_right) - def config(defaults=None, config_name='html', desc=_('Options to control the traversal of HTML')): if defaults is None: diff --git a/src/calibre/gui2/dialogs/epub.py b/src/calibre/gui2/dialogs/epub.py index 9f6dbd6dc6..78a2be0f51 100644 --- a/src/calibre/gui2/dialogs/epub.py +++ b/src/calibre/gui2/dialogs/epub.py @@ -17,6 +17,7 @@ from calibre.ebooks.epub.from_any import SOURCE_FORMATS, config from calibre.ebooks.metadata import MetaInformation from calibre.ptempfile import PersistentTemporaryFile from calibre.ebooks.metadata.opf import OPFCreator +from lxml.etree import XPath class Config(QDialog, Ui_Dialog): @@ -234,6 +235,16 @@ class Config(QDialog, Ui_Dialog): self.source_format = d.format() def accept(self): + for opt in ('chapter', 'level1_toc', 'level2_toc'): + text = unicode(getattr(self, 'opt_'+opt).text()) + if text: + try: + XPath(text,namespaces={'re':'http://exslt.org/regular-expressions'}) + except Exception, err: + error_dialog(self, _('Invalid XPath expression'), + _('The expression %s is invalid. Error: %s')%(text, err) + ).exec_() + return mi = self.get_metadata() self.read_settings() self.cover_file = None diff --git a/src/calibre/gui2/dialogs/epub.ui b/src/calibre/gui2/dialogs/epub.ui index fe4ccdef5d..3ecc0991e8 100644 --- a/src/calibre/gui2/dialogs/epub.ui +++ b/src/calibre/gui2/dialogs/epub.ui @@ -77,7 +77,7 @@ - 1 + 3 @@ -416,29 +416,36 @@ Base &font size: - opt_base_font_size + opt_base_font_size2 - + - % + pt 0 - 10.000000000000000 + 0.000000000000000 - 500.000000000000000 + 30.000000000000000 - 5.000000000000000 + 1.000000000000000 - 100.000000000000000 + 30.000000000000000 + + + + + + + Remove &spacing between paragraphs @@ -674,6 +681,32 @@ p, li { white-space: pre-wrap; } + + + + + + + Level &1 TOC + + + opt_level1_toc + + + + + + + Level &2 TOC + + + opt_level2_toc + + + + + + diff --git a/src/calibre/linux.py b/src/calibre/linux.py index 7a820d3cfa..33796f5b15 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -295,6 +295,11 @@ complete -o nospace -F _prs500 prs500 ''') f.close() print 'done' + except TypeError, err: + if 'resolve_entities' in str(err): + print 'You need python-lxml >= 2.0.5 for calibre' + sys.exit(1) + raise except: if fatal_errors: raise diff --git a/src/calibre/trac/plugins/download.py b/src/calibre/trac/plugins/download.py index 33049e9dc8..ca5ecabed4 100644 --- a/src/calibre/trac/plugins/download.py +++ b/src/calibre/trac/plugins/download.py @@ -45,7 +45,7 @@ class Distribution(object): INSTALLERS = ('emerge -avn', 'apt-get install', 'yum install') AS_ROOT = (True, False, True) - TITLEMAP = {'gentoo':'Gentoo', 'ubuntu':'Ubuntu Interpid Ibex', + TITLEMAP = {'gentoo':'Gentoo', 'ubuntu':'Ubuntu Intrepid Ibex', 'fedora':'Fedora 10', 'debian':'Debian sid', 'generic': 'Install from source'} MANUAL_MAP = { diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py index 003e9af318..dffb9f8c56 100644 --- a/src/calibre/web/feeds/__init__.py +++ b/src/calibre/web/feeds/__init__.py @@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal ' ''' Contains the logic for parsing feeds. ''' -import time, logging, traceback +import time, logging, traceback, copy from datetime import datetime from calibre.web.feeds.feedparser import parse @@ -17,7 +17,7 @@ class Article(object): def __init__(self, id, title, url, summary, published, content): self.downloaded = False self.id = id - self.title = title + self.title = title.strip() if title else title self.url = url self.summary = summary self.content = content @@ -38,7 +38,14 @@ Has content : %s def __str__(self): return repr(self) - + + def is_same_as(self, other_article): + #if self.title != getattr(other_article, 'title', False): + # return False + if self.url: + return self.url == getattr(other_article, 'url', False) + return self.content == getattr(other_article, 'content', False) + class Feed(object): @@ -169,7 +176,72 @@ class Feed(object): len(a.summary if a.summary else '')) return length > 2000 * len(self) + + def has_article(self, article): + for a in self: + if a.is_same_as(article): + return True + return False + + def find(self, article): + for i, a in enumerate(self): + if a.is_same_as(article): + return i + return -1 + + def remove(self, article): + i = self.index(article) + if i > -1: + self.articles[i:i+1] = [] +class FeedCollection(list): + + def __init__(self, feeds): + list.__init__(self, [f for f in feeds if len(f.articles) > 0]) + found_articles = set([]) + duplicates = set([]) + + def in_set(s, a): + for x in s: + if a.is_same_as(x): + return x + return None + + print '#feeds', len(self) + print map(len, self) + for f in self: + dups = [] + for a in f: + first = in_set(found_articles, a) + if first is not None: + dups.append(a) + duplicates.add((first, f)) + else: + found_articles.add(a) + for x in dups: + f.articles.remove(x) + + self.duplicates = duplicates + print len(duplicates) + print map(len, self) + #raise + + def find_article(self, article): + for j, f in enumerate(self): + for i, a in enumerate(f): + if a is article: + return (j, i) + + def restore_duplicates(self): + temp = [] + for article, feed in self.duplicates: + art = copy.deepcopy(article) + j, i = self.find_article(article) + art.url = '../feed_%d/article_%d/index.html'%(j, i) + temp.append((feed, art)) + for feed, art in temp: + feed.articles.append(art) + def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)): diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 6a9e9acd52..212ca84aac 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -289,15 +289,16 @@ class BasicNewsRecipe(object, LoggingInterface): ''' return soup - def postprocess_html(self, soup): + def postprocess_html(self, soup, first_fetch): ''' This method is called with the source of each downloaded :term:`HTML` file, after it is parsed for links and images. It can be used to do arbitrarily powerful post-processing on the :term:`HTML`. It should return `soup` after processing it. - `soup`: A `BeautifulSoup `_ + :param soup: A `BeautifulSoup `_ instance containing the downloaded :term:`HTML`. + :param first_fetch: True if this is the first page of an article. ''' return soup @@ -482,7 +483,7 @@ class BasicNewsRecipe(object, LoggingInterface): elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') body.insert(0, elem) - return self.postprocess_html(soup) + return self.postprocess_html(soup, first_fetch) def download(self): diff --git a/src/calibre/web/feeds/recipes/espn.py b/src/calibre/web/feeds/recipes/espn.py index d8c33847cf..34a1bc609a 100644 --- a/src/calibre/web/feeds/recipes/espn.py +++ b/src/calibre/web/feeds/recipes/espn.py @@ -67,7 +67,7 @@ class ESPN(BasicNewsRecipe): return soup - def postprocess_html(self, soup): + def postprocess_html(self, soup, first_fetch): for div in soup.findAll('div', style=True): div['style'] = div['style'].replace('center', 'left') return soup diff --git a/src/calibre/web/feeds/recipes/newsweek.py b/src/calibre/web/feeds/recipes/newsweek.py index 0da8b8965d..9ad551c469 100644 --- a/src/calibre/web/feeds/recipes/newsweek.py +++ b/src/calibre/web/feeds/recipes/newsweek.py @@ -92,7 +92,7 @@ class Newsweek(BasicNewsRecipe): return sections - def postprocess_html(self, soup): + def postprocess_html(self, soup, first_fetch): divs = list(soup.findAll('div', 'pagination')) if not divs: return diff --git a/src/calibre/web/feeds/recipes/outlook_india.py b/src/calibre/web/feeds/recipes/outlook_india.py index c5782d1536..db8ad900ab 100644 --- a/src/calibre/web/feeds/recipes/outlook_india.py +++ b/src/calibre/web/feeds/recipes/outlook_india.py @@ -73,7 +73,7 @@ class OutlookIndia(BasicNewsRecipe): return feeds - def postprocess_html(self, soup): + def postprocess_html(self, soup, first_fetch): bad = [] for table in soup.findAll('table'): if table.find(text=re.compile(r'\(\d+ of \d+\)')): diff --git a/src/calibre/web/feeds/recipes/scientific_american.py b/src/calibre/web/feeds/recipes/scientific_american.py index b9ca0f131f..7d22013aaf 100644 --- a/src/calibre/web/feeds/recipes/scientific_american.py +++ b/src/calibre/web/feeds/recipes/scientific_american.py @@ -7,14 +7,16 @@ __docformat__ = 'restructuredtext en' sciam.com ''' import re +from lxml import html from calibre.web.feeds.news import BasicNewsRecipe class ScientificAmerican(BasicNewsRecipe): title = u'Scientific American' - description = u'Popular science' + description = u'Popular science. Monthly magazine.' __author__ = 'Kovid Goyal' oldest_article = 30 max_articles_per_feed = 100 + no_stylesheets = True use_embedded_content = False remove_tags_before = dict(name='div', attrs={'class':'headline'}) remove_tags_after = dict(id='article') @@ -26,25 +28,102 @@ class ScientificAmerican(BasicNewsRecipe): html2lrf_options = ['--base-font-size', '8'] recursions = 1 match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14)'] - feeds = [ - (u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'), - (u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'), - (u'Health', u'http://rss.sciam.com/sciam/health'), - (u'Space', u'http://rss.sciam.com/sciam/space'), - (u'Technology', u'http://rss.sciam.com/sciam/technology'), - (u'Biology', u'http://rss.sciam.com/sciam/biology'), - (u'Mind & Brain', u'http://rss.sciam.com/sciam/mind-and-brain'), - (u"What's Next", u'http://rss.sciam.com/sciam/whats-next'), - (u'Archeology and Paleontology', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'), - (u'Physics', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'), - (u'Math', u'http://rss.sciam.com/sciam/math'), - (u'History of Science', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'), - (u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'), - (u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog') - ] +# feeds = [ +# (u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'), +# (u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'), +# (u'Health', u'http://rss.sciam.com/sciam/health'), +# (u'Space', u'http://rss.sciam.com/sciam/space'), +# (u'Technology', u'http://rss.sciam.com/sciam/technology'), +# (u'Biology', u'http://rss.sciam.com/sciam/biology'), +# (u'Mind & Brain', u'http://rss.sciam.com/sciam/mind-and-brain'), +# (u"What's Next", u'http://rss.sciam.com/sciam/whats-next'), +# (u'Archeology and Paleontology', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'), +# (u'Physics', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'), +# (u'Math', u'http://rss.sciam.com/sciam/math'), +# (u'History of Science', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'), +# (u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'), +# (u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog') +# ] +# + def parse_index(self): + src = self.browser.open('http://www.sciam.com/sciammag/').read() + root = html.fromstring(src) + self.cover_url = root.xpath('//img[re:match(@src, "cover_")]', + namespaces={'re':'http://exslt.org/regular-expressions'} + )[0].get('src') + self.timefmt = root.xpath('//div[@id = "magazine-month"]')[0].text + feeds = [] + features = [] + for a in root.xpath('//a[@href and @title = "Feature"]'): + if not a.text.strip(): + continue + article = { + 'url' : a.get('href'), + 'title' : u''.join(a.xpath('./text()')), + 'date' : '', + 'description' : '', + } + for s in a.itersiblings('span'): + if s.get('class', '') == 'sub': + article['description'] += u''.join(s.xpath('./text()')) + ' ' + features.append(article) + if features: + feeds.append(('Features', features)) + + departments = [] + for a in root.xpath('//a[@href and @class="title"]'): + txt = u''.join(a.xpath('./text()')).strip() + if not txt: + continue + article = { + 'url' : a.get('href'), + 'title' : txt, + 'date' : '', + 'description' : '', + } + p = a.getparent() + p.remove(a) + article['description'] = u''.join(p.xpath('./text()')) + departments.append(article) + + feeds.append(('Departments', departments)) + opinion = [] + for a in root.xpath('//div[@id = "opinion"]//a[@href]'): + txt = u''.join(a.xpath('./text()')).strip() + if not txt: + continue + article = { + 'url' : a.get('href'), + 'title' : txt, + 'date' : '', + 'description' : '', + } + opinion.append(article) + feeds.append(('Opinion', opinion)) + + ontheweb = [] + for a in root.xpath('//div[@id = "ontheweb"]//a[@href]'): + txt = u''.join(a.xpath('./text()')).strip() + if not txt: + continue + article = { + 'url' : a.get('href'), + 'title' : txt, + 'date' : '', + 'description' : '', + } + ontheweb.append(article) + feeds.append(('On the web', ontheweb)) + + return feeds + - def postprocess_html(self, soup): + def postprocess_html(self, soup, first_fetch): if soup is not None: for span in soup.findAll('span', attrs={'class':'pagination'}): span.extract() + if not first_fetch: + div = soup.find('div', attrs={'class':'headline'}) + if div: + div.extract() return soup diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py index bd867a2045..c220e8390f 100644 --- a/src/calibre/web/fetch/simple.py +++ b/src/calibre/web/fetch/simple.py @@ -198,7 +198,7 @@ class RecursiveFetcher(object, LoggingInterface): try: f = self.fetch_url(iurl) except Exception, err: - self.log_warning('Could not fetch stylesheet %s', iurl) + self.log_debug('Could not fetch stylesheet %s', iurl) self.log_debug('Error: %s', str(err), exc_info=True) continue stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')