Implemented font size control in EPUB conversion: the base font size can now be specified in absolute units. Spacing between paragraphs is now removed by default for EPUB output (this can be turned off). Added options for more sophisticated generation of an automatic Table of Contents in EPUB files. Restricted the Scientific American recipe to downloading only articles from the current issue.

This commit is contained in:
Kovid Goyal 2008-10-04 13:02:38 -07:00
parent 6fab7e97c3
commit 7fefb01f35
17 changed files with 788 additions and 180 deletions

View File

@ -284,7 +284,7 @@ def english_sort(x, y):
class LoggingInterface:
def __init__(self, logger):
self.__logger = logger
self.__logger = self.logger = logger
def setup_cli_handler(self, verbosity):
for handler in self.__logger.handlers:

View File

@ -7,7 +7,6 @@ __docformat__ = 'restructuredtext en'
Conversion to EPUB.
'''
import sys, textwrap
from lxml import html
from calibre.utils.config import Config, StringConfig
from calibre.utils.zipfile import ZipFile, ZIP_STORED
from calibre.ebooks.html import config as common_config, tostring
@ -16,13 +15,11 @@ class DefaultProfile(object):
flow_size = sys.maxint
screen_size = None
dpi = 100
class PRS505(DefaultProfile):
flow_size = 300000
screen_size = (600, 775)
dpi = 166
PROFILES = {
@ -30,6 +27,13 @@ PROFILES = {
'None' : DefaultProfile,
}
def rules(stylesheets):
    '''
    Yield every style rule from the given stylesheets.

    Entries that are not parsed stylesheet objects (anything lacking a
    ``cssText`` attribute, e.g. raw CSS strings) are skipped entirely, as
    are non-STYLE_RULE rules such as @media or @import.
    '''
    for sheet in stylesheets:
        if not hasattr(sheet, 'cssText'):
            continue
        for rule in sheet:
            if rule.type == rule.STYLE_RULE:
                yield rule
def initialize_container(path_to_container, opf_name='metadata.opf'):
'''
Create an empty EPUB document, with a default skeleton.
@ -95,6 +99,12 @@ to auto-generate a Table of Contents.
help=_("Don't add auto-detected chapters to the Table of Contents."))
toc('toc_threshold', ['--toc-threshold'], default=6,
help=_('If fewer than this number of chapters is detected, then links are added to the Table of Contents.'))
toc('level1_toc', ['--level1-toc'], default=None,
help=_('XPath expression that specifies all tags that should be added to the Table of Contents at level one. If this is specified, it takes precedence over other forms of auto-detection.'))
toc('level2_toc', ['--level2-toc'], default=None,
help=_('XPath expression that specifies all tags that should be added to the Table of Contents at level two. Each entry is added under the previous level one entry.'))
toc('from_ncx', ['--from-ncx'], default=None,
help=_('Path to a .ncx file that contains the table of contents to use for this ebook. The NCX file should contain links relative to the directory it is placed in. See http://www.niso.org/workrooms/daisy/Z39-86-2005.html#NCX for an overview of the NCX format.'))
toc('use_auto_toc', ['--use-auto-toc'], default=False,
help=_('Normally, if the source file already has a Table of Contents, it is used in preference to the autodetected one. With this option, the autodetected one is always used.'))
@ -107,8 +117,10 @@ to auto-generate a Table of Contents.
help=_('Set the left margin in pts. Default is %default'))
layout('margin_right', ['--margin-right'], default=5.0,
help=_('Set the right margin in pts. Default is %default'))
layout('base_font_size', ['--base-font-size'], default=100.0,
help=_('The base font size as a percentage. Default is %default. Changing this should allow you to control overall base font sizes, except for input HTML files that use absolute font sizes for their text tags.'))
layout('base_font_size2', ['--base-font-size'], default=12.0,
help=_('The base font size in pts. Default is %defaultpt. Set to 0 to disable rescaling of fonts.'))
layout('remove_paragraph_spacing', ['--remove-paragraph-spacing'], default=True,
help=_('Remove spacing between paragraphs. Will not work if the source file forces inter-paragraph spacing.'))
c.add_opt('show_opf', ['--show-opf'], default=False, group='debug',
help=_('Print generated OPF file to stdout'))

View File

@ -0,0 +1,300 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Font size rationalization. See :class:`Rationalizer`.
'''
import logging, re, operator, functools, collections, unittest, copy, sys
from xml.dom import SyntaxErr
from lxml.cssselect import CSSSelector
from lxml import etree
from lxml.html import HtmlElement
from calibre.ebooks.html import fromstring
from calibre.ebooks.epub import rules
from cssutils import CSSParser
# Regular expressions matching the CSS 2.1 grammar for font-size values.
num = r'[-]?\d+|[-]?\d*\.\d+'  # CSS number: optionally signed integer or decimal
# A length is either a bare zero or a number followed by a CSS unit.
length = r'(?P<zero>0)|(?P<num>{num})(?P<unit>%|em|ex|px|in|cm|mm|pt|pc)'.replace('{num}', num)
absolute_size = r'(?P<abs>(x?x-)?(small|large)|medium)'  # keywords: xx-small ... xx-large, medium
relative_size = r'(?P<rel>smaller|larger)'  # keywords relative to the parent size

font_size_pat = re.compile('|'.join((relative_size, absolute_size, length)), re.I)
# line-height values in absolute units only (no %, em or ex)
line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num))

# Points-per-unit: multipliers converting each absolute CSS length unit to pts.
PTU = {
    'in' : 72.,
    'cm' : 72/2.54,
    'mm' : 72/25.4,
    'pt' : 1.0,
    'pc' : 1/12.,
}

# Font size in pts assumed for the document root / unstyled text.
DEFAULT_FONT_SIZE = 12
class Rationalizer(object):
    '''
    Rescale all font sizes in an HTML document so that the most common
    ("base") size becomes a caller-supplied value, emitting the rescaled
    sizes as relative (%) CSS rules keyed on element ids. Operates on lxml
    trees whose elements expose ``specified_font_size`` /
    ``computed_font_size`` properties and on cssutils stylesheet objects.
    '''

    @classmethod
    def specificity(cls, s):
        '''Map CSS specificity tuple to a single integer'''
        # NOTE(review): '+ x' adds each component instead of scaling it; a
        # conventional specificity fold would be 10**(4-i) * x. Confirm
        # intent — this method appears unused within this module.
        return sum([10**(4-i) + x for i,x in enumerate(s)])

    @classmethod
    def compute_font_size(cls, elem):
        '''
        Calculate the effective font size of an element traversing its
        ancestors as far as necessary.

        Stores the result (in pts) in ``elem.computed_font_size``.
        '''
        cfs = elem.computed_font_size
        if cfs is not None:
            return  # already computed on an earlier traversal
        sfs = elem.specified_font_size
        if callable(sfs):
            # Relative size (em/ex/%/smaller/larger, or unspecified):
            # resolve against the parent's computed size, recursing first.
            parent = elem.getparent()
            cls.compute_font_size(parent)
            elem.computed_font_size = sfs(parent.computed_font_size)
        else:
            elem.computed_font_size = sfs

    @classmethod
    def calculate_font_size(cls, style):
        'Return font size in pts from style object. For relative units returns a callable'
        # Prefer an explicit font-size property over a size embedded in the
        # font shorthand.
        match = font_size_pat.search(style.font)
        fs = ''
        if match:
            fs = match.group()
        if style.fontSize:
            fs = style.fontSize

        match = font_size_pat.search(fs)
        if match is None:
            return None
        match = match.groupdict()
        unit = match.get('unit', '')
        if unit: unit = unit.lower()
        if unit in PTU.keys():
            # Absolute unit: convert directly to pts.
            return PTU[unit] * float(match['num'])
        if unit in ('em', 'ex'):
            # Relative to parent size: return a multiplier callable.
            return functools.partial(operator.mul, float(match['num']))
        if unit == '%':
            return functools.partial(operator.mul, float(match['num'])/100.)
        # NOTE(review): 'abs' shadows the builtin within this method.
        abs = match.get('abs', '')
        if abs: abs = abs.lower()
        if abs:
            # Keyword sizes: each x- step scales by 1.2 from medium (12pt),
            # e.g. xx-small = 12 * 1.2**-2, x-large = 12 * 1.2**1.
            x = (1.2)**(abs.count('x') * (-1 if 'small' in abs else 1))
            return 12 * x
        if match.get('zero', False):
            return 0.
        # Remaining cases are the relative keywords smaller/larger.
        return functools.partial(operator.mul, 1.2) if 'larger' in fs.lower() else functools.partial(operator.mul, 0.8)

    @classmethod
    def resolve_rules(cls, stylesheets):
        '''
        Annotate each stylesheet with ``fs_rules`` and ``lh_rules``: lists of
        [CSSSelector, value] pairs for every style rule that sets a font
        size or an absolute line-height. Idempotent per sheet.
        '''
        for sheet in stylesheets:
            if hasattr(sheet, 'fs_rules'):
                continue  # this sheet was already resolved
            sheet.fs_rules = []
            sheet.lh_rules = []
            for r in sheet:
                if r.type == r.STYLE_RULE:
                    font_size = cls.calculate_font_size(r.style)
                    if font_size is not None:
                        # One entry per selector so each can be applied
                        # to the document tree independently.
                        for s in r.selectorList:
                            sheet.fs_rules.append([CSSSelector(s.selectorText), font_size])
                    orig = line_height_pat.search(r.style.lineHeight)
                    if orig is not None:
                        for s in r.selectorList:
                            # Convert the line-height to pts immediately.
                            sheet.lh_rules.append([CSSSelector(s.selectorText), float(orig.group(1)) * PTU[orig.group(2).lower()]])

    @classmethod
    def apply_font_size_rules(cls, stylesheets, root):
        'Add a ``specified_font_size`` attribute to every element that has a specified font size'
        # NOTE: rules are applied in sheet/document order, not by CSS
        # specificity — later matches overwrite earlier ones.
        cls.resolve_rules(stylesheets)
        for sheet in stylesheets:
            for selector, font_size in sheet.fs_rules:
                elems = selector(root)
                for elem in elems:
                    elem.specified_font_size = font_size

    @classmethod
    def remove_font_size_information(cls, stylesheets):
        '''
        Strip font-size and absolute line-height declarations from every
        style rule in the given stylesheets.
        '''
        for r in rules(stylesheets):
            r.style.removeProperty('font-size')
            try:
                # Remove the size component from the font shorthand,
                # keeping the rest of the shorthand if anything remains.
                new = font_size_pat.sub('', r.style.font).strip()
                if new:
                    r.style.font = new
                else:
                    r.style.removeProperty('font')
            except SyntaxErr:
                # The stripped shorthand is no longer valid CSS: drop it.
                r.style.removeProperty('font')
            if line_height_pat.search(r.style.lineHeight) is not None:
                r.style.removeProperty('line-height')

    @classmethod
    def compute_font_sizes(cls, root, stylesheets, base=12):
        '''
        Rescale the document so its most common font size becomes ``base``
        pts (no rescaling when ``base`` <= 0). Returns a new stylesheet of
        id-keyed rules expressing the rescaled sizes as percentages and
        absolute line-heights in pts.
        '''
        stylesheets = [s for s in stylesheets if hasattr(s, 'cssText')]
        cls.apply_font_size_rules(stylesheets, root)

        # Compute the effective font size of all tags
        root.computed_font_size = DEFAULT_FONT_SIZE
        for elem in root.iter(etree.Element):
            cls.compute_font_size(elem)

        extra_css = {}  # maps element id -> list of CSS declarations
        if base > 0:
            # Calculate the "base" (i.e. most common) font size, weighted by
            # the number of text characters rendered at each size. Headings
            # are ignored so they do not skew the result.
            font_sizes = collections.defaultdict(lambda : 0)
            body = root.xpath('//body')[0]
            IGNORE = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
            for elem in body.iter(etree.Element):
                if elem.tag not in IGNORE:
                    t = getattr(elem, 'text', '')
                    if t: t = t.strip()
                    if t:
                        font_sizes[elem.computed_font_size] += len(t)

                    # Tail text is rendered at the parent's size.
                    t = getattr(elem, 'tail', '')
                    if t: t = t.strip()
                    if t:
                        parent = elem.getparent()
                        if parent.tag not in IGNORE:
                            font_sizes[parent.computed_font_size] += len(t)

            try:
                most_common = max(font_sizes.items(), key=operator.itemgetter(1))[0]
                # NOTE(review): under Python 2 this is integer division when
                # both operands are ints (e.g. base=10, most_common=12 -> 0);
                # consider float(base)/most_common. Confirm.
                scale = base/most_common if most_common > 0 else 1.
            except ValueError:
                # max() on an empty mapping: document has no text.
                scale = 1.

            # rescale absolute line-heights
            counter = 0  # used to generate unique ids for unlabelled elements
            for sheet in stylesheets:
                for selector, lh in sheet.lh_rules:
                    for elem in selector(root):
                        elem.set('id', elem.get('id', 'cfs_%d'%counter))
                        counter += 1
                        if not extra_css.has_key(elem.get('id')):
                            extra_css[elem.get('id')] = []
                        extra_css[elem.get('id')].append('line-height:%fpt'%(lh*scale))

            # Rescale all computed font sizes
            for elem in body.iter(etree.Element):
                if isinstance(elem, HtmlElement):
                    elem.computed_font_size *= scale

            # Remove all font size specifications from the last stylesheet
            cls.remove_font_size_information(stylesheets[-1:])

            # Create the CSS to implement the rescaled font sizes: emit a
            # %-rule only where an element's size differs from its parent's
            # by more than 1/12 pt (CSS inheritance covers the rest).
            for elem in body.iter(etree.Element):
                cfs, pcfs = map(operator.attrgetter('computed_font_size'), (elem, elem.getparent()))
                if abs(cfs-pcfs) > 1/12. and abs(pcfs) > 1/12.:
                    elem.set('id', elem.get('id', 'cfs_%d'%counter))
                    counter += 1
                    if not extra_css.has_key(elem.get('id')):
                        extra_css[elem.get('id')] = []
                    extra_css[elem.get('id')].append('font-size: %f%%'%(100*(cfs/pcfs)))

        # Serialize the collected per-id declarations into a stylesheet.
        css = CSSParser(loglevel=logging.ERROR).parseString('')
        for id, r in extra_css.items():
            css.add('#%s {%s}'%(id, ';'.join(r)))
        return css

    @classmethod
    def rationalize(cls, stylesheets, root, opts):
        '''
        Entry point: rescale fonts to ``opts.base_font_size2`` pts.

        Returns the extra stylesheet produced by :meth:`compute_font_sizes`,
        or None when rescaling is disabled or fails. Always strips the
        bookkeeping font-size attributes from the tree afterwards.
        '''
        logger = logging.getLogger('html2epub')
        logger.info('\t\tRationalizing fonts...')
        extra_css = None
        if opts.base_font_size2 > 0:
            try:
                extra_css = cls.compute_font_sizes(root, stylesheets, base=opts.base_font_size2)
            except:
                # Best effort: a failed rationalization must not abort the
                # whole conversion.
                logger.warning('Failed to rationalize font sizes.')
                if opts.verbose > 1:
                    logger.exception('')
            finally:
                root.remove_font_size_information()
        logger.debug('\t\tDone rationalizing')
        return extra_css
################################################################################
############## Testing
################################################################################
class FontTest(unittest.TestCase):
    '''
    Tests for :class:`Rationalizer`. The core round-trip check
    (:meth:`do_test`) verifies that applying the generated extra CSS to a
    stripped copy of the document reproduces the rescaled computed sizes.
    '''

    def setUp(self):
        # Imported here to avoid a circular import at module load time —
        # TODO confirm.
        from calibre.ebooks.epub import config
        self.opts = config(defaults='').parse()
        self.html = '''
<html>
<head>
<title>Test document</title>
</head>
<body>
<div id="div1">
<!-- A comment -->
<p id="p1">Some <b>text</b></p>
</div>
<p id="p2">Some other <span class="it">text</span>.</p>
<p id="longest">The longest piece of single font size text in this entire file. Used to test resizing.</p>
</body>
</html>
'''
        self.root = fromstring(self.html)

    def do_test(self, css, base=DEFAULT_FONT_SIZE, scale=1):
        '''
        Rationalize a copy of the test document with ``css``, then verify
        that a second, size-stripped copy styled only by the generated
        stylesheet computes identical font sizes. Returns the generated CSS
        text for further assertions.
        '''
        root1 = copy.deepcopy(self.root)
        root1.computed_font_size = DEFAULT_FONT_SIZE
        stylesheet = CSSParser(loglevel=logging.ERROR).parseString(css)
        stylesheet2 = Rationalizer.compute_font_sizes(root1, [stylesheet], base)
        root2 = copy.deepcopy(root1)
        root2.remove_font_size_information()
        root2.computed_font_size = DEFAULT_FONT_SIZE
        Rationalizer.apply_font_size_rules([stylesheet2], root2)
        for elem in root2.iter(etree.Element):
            Rationalizer.compute_font_size(elem)
        # Compare element-by-element: document order is identical in both
        # copies, so zip pairs corresponding elements.
        for e1, e2 in zip(root1.xpath('//body')[0].iter(etree.Element), root2.xpath('//body')[0].iter(etree.Element)):
            self.assertAlmostEqual(e1.computed_font_size, e2.computed_font_size,
                msg='Computed font sizes for %s not equal. Original: %f Processed: %f'%\
                (root1.getroottree().getpath(e1), e1.computed_font_size, e2.computed_font_size))
        return stylesheet2.cssText

    def testStripping(self):
        'Test that any original entries are removed from the CSS'
        css = 'p { font: bold 10px italic smaller; font-size: x-large} \na { font-size: 0 }'
        css = CSSParser(loglevel=logging.ERROR).parseString(css)
        Rationalizer.compute_font_sizes(copy.deepcopy(self.root), [css])
        self.assertEqual(css.cssText.replace(' ', '').replace('\n', ''),
            'p{font:bolditalic}')

    def testIdentity(self):
        'Test that no unnecessary font size changes are made'
        extra_css = self.do_test('div {font-size:12pt} \nspan {font-size:100%}')
        self.assertEqual(extra_css.strip(), '')

    def testRelativization(self):
        'Test conversion of absolute to relative sizes'
        self.do_test('#p1 {font: 24pt} b {font: 12pt} .it {font: 48pt} #p2 {font: 100%}')

    def testResizing(self):
        'Test resizing of fonts'
        self.do_test('#longest {font: 24pt} .it {font:20pt; line-height:22pt}')
def suite():
    'Return a TestSuite containing every test defined on :class:`FontTest`.'
    loader = unittest.TestLoader()
    return loader.loadTestsFromTestCase(FontTest)
def test():
    '''
    Run the font rationalization test suite and return a process exit code.

    Returns 0 when all tests pass and 1 otherwise, so that
    ``sys.exit(test())`` in the __main__ guard reports failures to the
    shell. Previously the runner's result was discarded and the function
    returned None, making the exit status always 0 even on failure.
    '''
    result = unittest.TextTestRunner(verbosity=2).run(suite())
    return 0 if result.wasSuccessful() else 1
if __name__ == '__main__':
    # Allow running this module directly to execute the unit tests.
    sys.exit(test())

View File

@ -32,8 +32,7 @@ Conversion of HTML/OPF files follows several stages:
* The EPUB container is created.
'''
import os, sys, re, cStringIO, logging
from contextlib import nested
import os, sys, cStringIO, logging
from lxml.etree import XPath
try:
@ -41,7 +40,7 @@ try:
except ImportError:
import Image as PILImage
from calibre.ebooks.html import Processor, get_text, merge_metadata, get_filelist,\
from calibre.ebooks.html import Processor, merge_metadata, get_filelist,\
opf_traverse, create_metadata, rebase_toc
from calibre.ebooks.epub import config as common_config
from calibre.ptempfile import TemporaryDirectory
@ -50,21 +49,23 @@ from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ebooks.epub import initialize_container, PROFILES
from calibre.ebooks.epub.split import split
from calibre.ebooks.epub.fonts import Rationalizer
from calibre.constants import preferred_encoding
class HTMLProcessor(Processor):
class HTMLProcessor(Processor, Rationalizer):
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles):
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, stylesheets):
Processor.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles,
name='html2epub')
name='html2epub')
if opts.verbose > 2:
self.debug_tree('parsed')
self.detect_chapters()
self.extract_css()
self.relativize_font_sizes()
self.extract_css(stylesheets)
if self.opts.base_font_size2 > 0:
self.font_css = self.rationalize(self.external_stylesheets+[self.stylesheet],
self.root, self.opts)
if opts.verbose > 2:
self.debug_tree('nocss')
@ -73,19 +74,6 @@ class HTMLProcessor(Processor):
meta.getparent().remove(meta)
Processor.save(self)
#self.collect_font_statistics()
def collect_font_statistics(self):
'''
Collect font statistics to figure out the base font size used in this
HTML document.
'''
self.font_statistics = {} #: A mapping of font size (in pts) to number of characters rendered at that font size
for text in get_text(self.body if self.body is not None else self.root):
length, parent = len(re.sub(r'\s+', '', text)), text.getparent()
#TODO: Use cssutils on self.raw_css to figure out the font size
# of this piece of text and update statistics accordingly
@ -104,21 +92,30 @@ the <spine> element of the OPF file.
def parse_content(filelist, opts, tdir):
os.makedirs(os.path.join(tdir, 'content', 'resources'))
resource_map = {}
resource_map, stylesheets = {}, {}
toc = TOC(base_path=tdir, type='root')
stylesheet_map = {}
for htmlfile in filelist:
logging.getLogger('html2epub').debug('Processing %s...'%htmlfile)
hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'),
resource_map, filelist)
resource_map, filelist, stylesheets)
hp.populate_toc(toc)
hp.save()
stylesheet_map[os.path.basename(hp.save_path())] = \
[s for s in hp.external_stylesheets + [hp.stylesheet, hp.font_css, hp.override_css] if s is not None]
logging.getLogger('html2epub').debug('Saving stylesheets...')
if opts.base_font_size2 > 0:
Rationalizer.remove_font_size_information(stylesheets.values())
for path, css in stylesheets.items():
open(path, 'wb').write(getattr(css, 'cssText', css).encode('utf-8'))
if toc.count('chapter') > opts.toc_threshold:
toc.purge(['file', 'link', 'unknown'])
if toc.count('chapter') + toc.count('file') > opts.toc_threshold:
toc.purge(['link', 'unknown'])
toc.purge(['link'], max=opts.max_toc_links)
return resource_map, hp.htmlfile_map, toc
return resource_map, hp.htmlfile_map, toc, stylesheet_map
def resize_cover(im, opts):
width, height = im.size
@ -176,7 +173,7 @@ def process_title_page(mi, filelist, htmlfilemap, opts, tdir):
<title>Cover</title>
<style type="text/css">@page {padding: 0pt; margin:0pt}</style>
</head>
<body style="padding: 0pt; margin: 0pt;}">
<body style="padding: 0pt; margin: 0pt">
<div style="text-align:center">
<img style="text-align: center" src="%s" alt="cover" />
</div>
@ -212,11 +209,22 @@ def convert(htmlfile, opts, notification=None):
mi = merge_metadata(htmlfile, opf, opts)
opts.chapter = XPath(opts.chapter,
namespaces={'re':'http://exslt.org/regular-expressions'})
if opts.level1_toc:
opts.level1_toc = XPath(opts.level1_toc,
namespaces={'re':'http://exslt.org/regular-expressions'})
else:
opts.level1_toc = None
if opts.level2_toc:
opts.level2_toc = XPath(opts.level2_toc,
namespaces={'re':'http://exslt.org/regular-expressions'})
else:
opts.level2_toc = None
with TemporaryDirectory(suffix='_html2epub', keep=opts.keep_intermediate) as tdir:
if opts.keep_intermediate:
print 'Intermediate files in', tdir
resource_map, htmlfile_map, generated_toc = parse_content(filelist, opts, tdir)
resource_map, htmlfile_map, generated_toc, stylesheet_map = \
parse_content(filelist, opts, tdir)
logger = logging.getLogger('html2epub')
resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()]
@ -235,6 +243,10 @@ def convert(htmlfile, opts, notification=None):
rebase_toc(mi.toc, htmlfile_map, tdir)
if opts.use_auto_toc or mi.toc is None or len(list(mi.toc.flat())) < 2:
mi.toc = generated_toc
if opts.from_ncx:
toc = TOC()
toc.read_ncx_toc(opts.from_ncx)
mi.toc = toc
for item in mi.manifest:
if getattr(item, 'mime_type', None) == 'text/html':
item.mime_type = 'application/xhtml+xml'
@ -247,7 +259,7 @@ def convert(htmlfile, opts, notification=None):
f.write(toc)
if opts.show_ncx:
print toc
split(opf_path, opts)
split(opf_path, opts, stylesheet_map)
opf = OPF(opf_path, tdir)
opf.remove_guide()
if has_title_page:

View File

@ -12,10 +12,9 @@ import os, math, logging, functools, collections, re, copy
from lxml.etree import XPath as _XPath
from lxml import etree, html
from lxml.cssselect import CSSSelector
from cssutils import CSSParser
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ebooks.epub import tostring
from calibre.ebooks.epub import tostring, rules
from calibre import CurrentDir, LoggingInterface
XPath = functools.partial(_XPath, namespaces={'re':'http://exslt.org/regular-expressions'})
@ -35,7 +34,7 @@ class SplitError(ValueError):
class Splitter(LoggingInterface):
def __init__(self, path, opts, always_remove=False):
def __init__(self, path, opts, stylesheet_map, always_remove=False):
LoggingInterface.__init__(self, logging.getLogger('htmlsplit'))
self.setup_cli_handler(opts.verbose)
self.path = path
@ -46,22 +45,8 @@ class Splitter(LoggingInterface):
self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.)
root = html.fromstring(open(content(path)).read())
css = XPath('//link[@type = "text/css" and @rel = "stylesheet"]')(root)
if css:
cssp = os.path.join('content', *(css[0].get('href').split('/')))
self.log_debug('\t\tParsing stylesheet...')
try:
stylesheet = CSSParser().parseString(open(cssp, 'rb').read())
except:
self.log_warn('Failed to parse CSS. Splitting on page-breaks is disabled')
if self.opts.verbose > 1:
self.log_exception('')
stylesheet = None
else:
stylesheet = None
self.page_breaks = []
if stylesheet is not None:
self.find_page_breaks(stylesheet, root)
self.find_page_breaks(stylesheet_map[self.path], root)
self.trees = []
self.split_size = 0
@ -189,14 +174,12 @@ class Splitter(LoggingInterface):
self.split(t)
def find_page_breaks(self, stylesheet, root):
def find_page_breaks(self, stylesheets, root):
'''
Find all elements that have either page-break-before or page-break-after set.
'''
page_break_selectors = set([])
for rule in stylesheet:
if rule.type != rule.STYLE_RULE:
continue
for rule in rules(stylesheets):
before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower()
after = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower()
try:
@ -385,7 +368,7 @@ def fix_ncx(path, changes):
if changed:
open(path, 'wb').write(etree.tostring(tree.getroot(), encoding='UTF-8', xml_declaration=True))
def split(pathtoopf, opts):
def split(pathtoopf, opts, stylesheet_map):
pathtoopf = os.path.abspath(pathtoopf)
with CurrentDir(os.path.dirname(pathtoopf)):
opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
@ -403,7 +386,7 @@ def split(pathtoopf, opts):
for f in html_files:
if os.stat(content(f)).st_size > opts.profile.flow_size:
try:
changes.append(Splitter(f, opts,
changes.append(Splitter(f, opts, stylesheet_map,
always_remove=(always_remove or \
os.stat(content(f)).st_size > 5*opts.profile.flow_size)))
except (SplitError, RuntimeError):

View File

@ -8,12 +8,14 @@ Code to recursively parse HTML files and create an open ebook in a specified
directory or zip file. All the action starts in :function:`create_dir`.
'''
import sys, re, os, shutil, logging, tempfile, cStringIO
import sys, re, os, shutil, logging, tempfile, cStringIO, operator, functools
from urlparse import urlparse
from urllib import unquote
from lxml import html, etree
from lxml.html import soupparser
from lxml import etree
from lxml.html import HtmlElementClassLookup, HTMLParser as _HTMLParser, \
fromstring as _fromstring, tostring as _tostring, \
soupparser, HtmlElement
from lxml.etree import XPath
get_text = XPath("//text()")
@ -25,9 +27,67 @@ from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
from calibre.utils.zipfile import ZipFile
from cssutils import CSSParser
class HTMLElement(HtmlElement):
    '''
    lxml HTML element extended with font size bookkeeping. Sizes are stored
    in element attributes (so they survive deepcopy and serialization);
    relative sizes are stored as callables serialized with an 'f' prefix
    followed by the multiplication factor.
    '''

    # Python 2 @apply idiom: the function is invoked immediately and the
    # name is bound to the property object it returns.
    @apply
    def specified_font_size():
        def fget(self):
            ans = self.get('specified_font_size', '')
            if not ans:
                # No specified size: identity function, i.e. inherit the
                # value passed in (the parent's computed size) unchanged.
                return lambda x: x
            if ans.startswith('f'):
                # 'f<factor>': relative size, multiplies the parent size.
                return functools.partial(operator.mul, float(ans[1:]))
            return float(ans)
        def fset(self, val):
            # Callables are serialized as 'f' + factor; val(1) recovers the
            # factor from a multiplier callable.
            self.set('specified_font_size', ('f'+repr(val(1))) if callable(val) else repr(val))
        return property(fget=fget, fset=fset)

    @apply
    def computed_font_size():
        def fget(self):
            ans = self.get('computed_font_size', '')
            if ans == '':
                return None  # not yet computed
            return float(ans)
        def fset(self, val):
            self.set('computed_font_size', repr(val))
        return property(fget=fget, fset=fset)

    def remove_font_size_information(self):
        '''Strip all font size bookkeeping attributes from this subtree.'''
        for elem in self.iter():
            for p in ('computed', 'specified'):
                elem.attrib.pop(p+'_font_size', None)

    def getpath(self):
        '''Return the absolute XPath of this element within its tree.'''
        return self.getroottree().getpath(self)
class Lookup(HtmlElementClassLookup):
    '''
    Element class lookup that maps every HTML element node to
    :class:`HTMLElement`; all other node types use the default lxml lookup.
    '''

    def lookup(self, node_type, document, namespace, name):
        # Only element nodes get our subclass; defer everything else
        # (comments, PIs, entities) to the base implementation.
        if node_type != 'element':
            return HtmlElementClassLookup.lookup(self, node_type, document, namespace, name)
        return HTMLElement
class HTMLParser(_HTMLParser):
    '''
    lxml HTML parser wired to :class:`Lookup`, so every parsed tree yields
    :class:`HTMLElement` nodes (with the font size property extensions)
    instead of plain HtmlElement instances.
    '''

    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        self.set_element_class_lookup(Lookup())
# Shared module-level parser instance used by fromstring() below, so all
# parses produce HTMLElement trees.
parser = HTMLParser()

def fromstring(raw, **kw):
    '''Parse HTML from ``raw`` using the HTMLElement-aware parser above.'''
    return _fromstring(raw, parser=parser, **kw)
def tostring(root, pretty_print=False):
return html.tostring(root, encoding='utf-8', method='xml',
return _tostring(root, encoding='utf-8', method='xml',
include_meta_content_type=True,
pretty_print=pretty_print)
@ -372,11 +432,11 @@ class Parser(PreProcessor, LoggingInterface):
for pat in ENCODING_PATS:
src = pat.sub('', src)
try:
self.root = html.fromstring(src)
self.root = fromstring(src)
except:
if self.opts.verbose:
self.log_exception('lxml based parsing failed')
self.root = soupparser.fromstring(src)
self.root = soupparser.fromstring(src, makeelement=parser.makeelement)
head = self.root.xpath('./head')
if head:
head = head[0]
@ -402,7 +462,7 @@ class Parser(PreProcessor, LoggingInterface):
os.makedirs(tdir)
with open(os.path.join(tdir, '%s-%s.html'%\
(os.path.basename(self.htmlfile.path), name)), 'wb') as f:
f.write(html.tostring(self.root, encoding='utf-8'))
f.write(tostring(self.root, encoding='utf-8'))
self.log_debug(_('Written processed HTML to ')+f.name)
@ -443,19 +503,21 @@ class Processor(Parser):
'''
LINKS_PATH = XPath('//a[@href]')
PIXEL_PAT = re.compile(r'([-]?\d+|[-]?\d*\.\d+)px')
def __init__(self, *args, **kwargs):
Parser.__init__(self, *args, **kwargs)
temp = LoggingInterface(logging.getLogger('cssutils'))
temp.setup_cli_handler(self.opts.verbose)
self.css_parser = CSSParser(log=temp.logger, loglevel=logging.ERROR)
self.stylesheet = self.font_css = self.override_css = None
def detect_chapters(self):
self.detected_chapters = self.opts.chapter(self.root)
for elem in self.detected_chapters:
text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')])
self.log_info('\tDetected chapter: %s', text[:50])
if self.opts.chapter_mark in ('both', 'pagebreak'):
style = elem.get('style', '').strip()
if style and not style.endswith(';'):
style += '; '
style += 'page-break-before: always'
elem.set('style', style)
if self.opts.chapter_mark in ('both', 'rule'):
if self.opts.chapter_mark != 'none':
hr = etree.Element('hr')
if elem.getprevious() is None:
elem.getparent()[:0] = [hr]
@ -466,16 +528,28 @@ class Processor(Parser):
insert = i
break
elem.getparent()[insert:insert] = [hr]
if self.opts.chapter_mark != 'rule':
hr.set('style', 'width:0pt;page-break-before:always')
if self.opts.chapter_mark == 'both':
hr2 = etree.Element('hr')
hr2.tail = u'\u00a0'
p = hr.getparent()
i = p.index(hr)
p[i:i] = [hr2]
def save(self):
style_path = os.path.basename(self.save_path())+'.css'
style = etree.SubElement(self.head, 'link', attrib={'type':'text/css', 'rel':'stylesheet',
'href':'resources/'+style_path,
'charset':'UTF-8'})
style.tail = '\n'
style_path = os.path.join(os.path.dirname(self.save_path()), 'resources', style_path)
open(style_path, 'wb').write(self.css.encode('utf-8'))
style_path = os.path.splitext(os.path.basename(self.save_path()))[0]
for i, sheet in enumerate([self.stylesheet, self.font_css, self.override_css]):
if sheet is not None:
style = etree.SubElement(self.head, 'link', attrib={'type':'text/css', 'rel':'stylesheet',
'href':'resources/%s_%d.css'%(style_path, i),
'charset':'UTF-8'})
style.tail = '\n'
path = os.path.join(os.path.dirname(self.save_path()), *(style.get('href').split('/')))
self.resource_map[path] = style.get('href')
open(path, 'wb').write(getattr(sheet, 'cssText', sheet).encode('utf-8'))
return Parser.save(self)
def populate_toc(self, toc):
@ -491,14 +565,45 @@ class Processor(Parser):
text = text[:50] + u'\u2026'
return target.add_item(href, fragment, text, type=type)
# Add chapters to TOC
name = self.htmlfile_map[self.htmlfile.path]
href = 'content/'+name
# Add level 1 and level 2 TOC items
counter = 0
if self.opts.level1_toc is not None:
level1 = self.opts.level1_toc(self.root)
if level1:
added = {}
for elem in level1:
text = (u''.join(elem.xpath('string()'))).strip()
if text:
id = elem.get('id', 'calibre_chapter_%d'%counter)
counter += 1
elem.set('id', id)
added[elem] = add_item(href, id, text, toc, type='chapter')
add_item(href, id, 'Top', added[elem], type='chapter')
if self.opts.level2_toc is not None:
level2 = list(self.opts.level2_toc(self.root))
for elem in level2:
level1 = None
for item in self.root.iterdescendants():
if item in added.keys():
level1 = added[item]
elif item == elem and level1 is not None:
text = (u''.join(elem.xpath('string()'))).strip()
if text:
id = elem.get('id', 'calibre_chapter_%d'%counter)
counter += 1
elem.set('id', id)
add_item(href, id, text, level1, type='chapter')
# Add chapters to TOC
if not self.opts.no_chapters_in_toc:
for elem in getattr(self, 'detected_chapters', []):
text = (u''.join(elem.xpath('string()'))).strip()
if text:
name = self.htmlfile_map[self.htmlfile.path]
href = 'content/'+name
counter += 1
id = elem.get('id', 'calibre_chapter_%d'%counter)
elem.set('id', id)
@ -518,8 +623,7 @@ class Processor(Parser):
pass
name = self.htmlfile_map[self.htmlfile.path]
href = 'content/'+name
if referrer.href != href: # Happens for root file
@ -541,13 +645,24 @@ class Processor(Parser):
name = self.htmlfile_map[self.htmlfile.referrer.path]
add_item(href, fragment, text, target)
@classmethod
def preprocess_css(cls, css, dpi=96):
def rescale(match):
val = match.group(1)
try:
val = float(val)
except ValueError:
return ''
return '%fpt'%(72 * val/dpi)
def extract_css(self):
return cls.PIXEL_PAT.sub(rescale, css)
def extract_css(self, parsed_sheets):
'''
Remove all CSS information from the document and store in self.raw_css.
This includes <font> tags.
Remove all CSS information from the document and store it as
:class:`StyleSheet` objects.
'''
def get_id(chapter, counter, prefix='calibre_css_'):
new_id = '%s_%d'%(prefix, counter)
if chapter.tag.lower() == 'a' and 'name' in chapter.keys():
@ -562,17 +677,40 @@ class Processor(Parser):
chapter.set('id', id)
return id
css = []
self.external_stylesheets, self.stylesheet = [], self.css_parser.parseString('')
for link in self.root.xpath('//link'):
if 'css' in link.get('type', 'text/css').lower():
file = os.path.join(self.tdir, link.get('href', ''))
if file and os.path.exists(file) and os.path.isfile(file):
css.append(open(file, 'rb').read().decode('utf-8'))
link.getparent().remove(link)
file = os.path.join(self.tdir, *(link.get('href', '').split('/')))
if file and not 'http:' in file:
if not parsed_sheets.has_key(file):
try:
self.log_info('Processing stylesheet %s...'%file)
css = self.preprocess_css(open(file).read())
except (IOError, OSError):
self.log_error('Failed to open stylesheet: %s'%file)
else:
try:
parsed_sheets[file] = self.css_parser.parseString(css)
except:
parsed_sheets[file] = css.decode('utf8', 'replace')
self.log_warning('Failed to parse stylesheet: %s'%file)
if self.opts.verbose > 1:
self.log_exception('')
if parsed_sheets.has_key(file):
self.external_stylesheets.append(parsed_sheets[file])
for style in self.root.xpath('//style'):
if 'css' in style.get('type', 'text/css').lower():
css.append('\n'.join(style.xpath('./text()')))
raw = '\n'.join(style.xpath('./text()'))
css = self.preprocess_css(raw)
try:
sheet = self.css_parser.parseString(css)
except:
self.log_debug('Failed to parse style element')
else:
for rule in sheet:
self.stylesheet.add(rule)
style.getparent().remove(style)
cache = {}
@ -613,57 +751,19 @@ class Processor(Parser):
elem.set('class', cn)
elem.attrib.pop('style')
for setting, cn in cache.items():
css.append('.%s {%s}'%(cn, setting))
self.raw_css = '\n\n'.join(css)
self.css = unicode(self.raw_css)
css = '\n'.join(['.%s {%s;}'%(cn, setting) for \
setting, cn in cache.items()])
self.stylesheet = self.css_parser.parseString(self.preprocess_css(css))
css = ''
if self.opts.override_css:
self.css += '\n\n'+self.opts.override_css
self.do_layout()
# TODO: Figure out what to do about CSS imports from linked stylesheets
def relativize_font_sizes(self, dpi=100, base=16):
'''
Convert all absolute font sizes to percentages of ``base`` using ``dpi``
to convert from screen to paper units.
:param base: Base size in pixels. Adobe DE seems to need base size to be 16
irrespective of the unit of the length being converted
:param dpi: Dots per inch used to convert pixels to absolute lengths. Since
most HTML files are created on computers with monitors of DPI ~ 100, we use
100 by default.
'''
size_value_pat = re.compile(r'(?<!/)(?P<num>[0-9.]+)(?P<unit>cm|mm|in|pt|pc|px)', re.I)
css += '\n\n' + self.opts.override_css
css += '\n\n' + 'body {margin-top: 0pt; margin-bottom: 0pt; margin-left: 0pt; margin-right: 0pt;}'
css += '\n\n@page {margin-top: %fpt; margin-bottom: %fpt; margin-left: %fpt; margin-right: %fpt}'%(self.opts.margin_top, self.opts.margin_bottom, self.opts.margin_left, self.opts.margin_right)
if self.opts.remove_paragraph_spacing:
css += '\n\np {text-indent: 2.1em; margin-top:1pt; margin-bottom:1pt; padding:0pt; border:0pt;}'
self.override_css = self.css_parser.parseString(self.preprocess_css(css))
# points per unit
ptu = { # Convert to pt
'px' : 72./dpi,
'pt' : 1.0,
'pc' : 1/12.,
'in' : 72.,
'cm' : 72/2.54,
'mm' : 72/25.4,
}
def relativize(match):
val = float(match.group('num'))
unit = match.group('unit').lower()
val *= ptu[unit]
return '%.1f%%'%((val/base) * 100)
def sub(match):
rule = match.group(1)
value = size_value_pat.sub(relativize, match.group(2))
return '%s : %s'%(rule, value)
self.css = re.compile(r'(font|font-size)\s*:\s*([^;]+)', re.I).sub(sub, self.css)
def do_layout(self):
    '''
    Append page-layout CSS derived from the conversion options: zero body
    margins with the requested base font size (as a percentage), and an
    @page rule carrying the user-specified margins in points.
    '''
    opts = self.opts
    body_rule = '\nbody {margin-top: 0pt; margin-bottom: 0pt; margin-left: 0pt; margin-right: 0pt; font-size: %f%%}\n' % opts.base_font_size
    page_rule = '@page {margin-top: %fpt; margin-bottom: %fpt; margin-left: %fpt; margin-right: %fpt}\n' % (
            opts.margin_top, opts.margin_bottom, opts.margin_left, opts.margin_right)
    self.css += body_rule + page_rule
def config(defaults=None, config_name='html',
desc=_('Options to control the traversal of HTML')):
if defaults is None:

View File

@ -17,6 +17,7 @@ from calibre.ebooks.epub.from_any import SOURCE_FORMATS, config
from calibre.ebooks.metadata import MetaInformation
from calibre.ptempfile import PersistentTemporaryFile
from calibre.ebooks.metadata.opf import OPFCreator
from lxml.etree import XPath
class Config(QDialog, Ui_Dialog):
@ -234,6 +235,16 @@ class Config(QDialog, Ui_Dialog):
self.source_format = d.format()
def accept(self):
for opt in ('chapter', 'level1_toc', 'level2_toc'):
text = unicode(getattr(self, 'opt_'+opt).text())
if text:
try:
XPath(text,namespaces={'re':'http://exslt.org/regular-expressions'})
except Exception, err:
error_dialog(self, _('Invalid XPath expression'),
_('The expression %s is invalid. Error: %s')%(text, err)
).exec_()
return
mi = self.get_metadata()
self.read_settings()
self.cover_file = None

View File

@ -77,7 +77,7 @@
<item>
<widget class="QStackedWidget" name="stack" >
<property name="currentIndex" >
<number>1</number>
<number>3</number>
</property>
<widget class="QWidget" name="metadata_page" >
<layout class="QGridLayout" name="gridLayout_4" >
@ -416,29 +416,36 @@
<string>Base &amp;font size:</string>
</property>
<property name="buddy" >
<cstring>opt_base_font_size</cstring>
<cstring>opt_base_font_size2</cstring>
</property>
</widget>
</item>
<item row="1" column="2" >
<widget class="QDoubleSpinBox" name="opt_base_font_size" >
<widget class="QDoubleSpinBox" name="opt_base_font_size2" >
<property name="suffix" >
<string> %</string>
<string> pt</string>
</property>
<property name="decimals" >
<number>0</number>
</property>
<property name="minimum" >
<double>10.000000000000000</double>
<double>0.000000000000000</double>
</property>
<property name="maximum" >
<double>500.000000000000000</double>
<double>30.000000000000000</double>
</property>
<property name="singleStep" >
<double>5.000000000000000</double>
<double>1.000000000000000</double>
</property>
<property name="value" >
<double>100.000000000000000</double>
<double>30.000000000000000</double>
</property>
</widget>
</item>
<item row="2" column="0" >
<widget class="QCheckBox" name="opt_remove_paragraph_spacing" >
<property name="text" >
<string>Remove &amp;spacing between paragraphs</string>
</property>
</widget>
</item>
@ -674,6 +681,32 @@ p, li { white-space: pre-wrap; }
</property>
</widget>
</item>
<item row="4" column="1" >
<widget class="QLineEdit" name="opt_level1_toc" />
</item>
<item row="4" column="0" >
<widget class="QLabel" name="label_19" >
<property name="text" >
<string>Level &amp;1 TOC</string>
</property>
<property name="buddy" >
<cstring>opt_level1_toc</cstring>
</property>
</widget>
</item>
<item row="5" column="0" >
<widget class="QLabel" name="label_20" >
<property name="text" >
<string>Level &amp;2 TOC</string>
</property>
<property name="buddy" >
<cstring>opt_level2_toc</cstring>
</property>
</widget>
</item>
<item row="5" column="1" >
<widget class="QLineEdit" name="opt_level2_toc" />
</item>
</layout>
</widget>
</item>

View File

@ -295,6 +295,11 @@ complete -o nospace -F _prs500 prs500
''')
f.close()
print 'done'
except TypeError, err:
if 'resolve_entities' in str(err):
print 'You need python-lxml >= 2.0.5 for calibre'
sys.exit(1)
raise
except:
if fatal_errors:
raise

View File

@ -45,7 +45,7 @@ class Distribution(object):
INSTALLERS = ('emerge -avn', 'apt-get install', 'yum install')
AS_ROOT = (True, False, True)
TITLEMAP = {'gentoo':'Gentoo', 'ubuntu':'Ubuntu Interpid Ibex',
TITLEMAP = {'gentoo':'Gentoo', 'ubuntu':'Ubuntu Intrepid Ibex',
'fedora':'Fedora 10', 'debian':'Debian sid', 'generic': 'Install from source'}
MANUAL_MAP = {

View File

@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Contains the logic for parsing feeds.
'''
import time, logging, traceback
import time, logging, traceback, copy
from datetime import datetime
from calibre.web.feeds.feedparser import parse
@ -17,7 +17,7 @@ class Article(object):
def __init__(self, id, title, url, summary, published, content):
self.downloaded = False
self.id = id
self.title = title
self.title = title.strip() if title else title
self.url = url
self.summary = summary
self.content = content
@ -38,7 +38,14 @@ Has content : %s
def __str__(self):
return repr(self)
def is_same_as(self, other_article):
    '''
    Two articles are considered the same when their URLs match; if this
    article has no URL, fall back to comparing content. Titles are not
    compared, since the same article may be syndicated under different
    titles.
    '''
    if self.url:
        other_url = getattr(other_article, 'url', False)
        return self.url == other_url
    other_content = getattr(other_article, 'content', False)
    return self.content == other_content
class Feed(object):
@ -169,7 +176,72 @@ class Feed(object):
len(a.summary if a.summary else ''))
return length > 2000 * len(self)
def has_article(self, article):
    '''Return True if an article equivalent to *article* is already in this feed.'''
    return any(a.is_same_as(article) for a in self)
def find(self, article):
    '''Return the index of the first article equivalent to *article*, or -1 if absent.'''
    matches = (pos for pos, candidate in enumerate(self)
               if candidate.is_same_as(article))
    return next(matches, -1)
def remove(self, article):
    '''
    Remove the first article equivalent to *article* from this feed,
    doing nothing if no such article is present.
    '''
    # Bug fix: the original called self.index(article), but Feed is not a
    # list subclass and the -1 sentinel guard below matches the contract of
    # find() (defined above), which returns -1 when the article is absent.
    i = self.find(article)
    if i > -1:
        self.articles[i:i+1] = []
class FeedCollection(list):
    # A list of Feed objects with cross-feed duplicate articles removed.
    # Empty feeds are dropped. Each removed duplicate is remembered so that
    # restore_duplicates() can later re-insert it as a link to the first
    # occurrence that was actually downloaded.

    def __init__(self, feeds):
        list.__init__(self, [f for f in feeds if len(f.articles) > 0])
        found_articles = set([])
        duplicates = set([])

        # Linear membership scan using Article.is_same_as, because articles
        # do not hash on their identity criteria (URL/content).
        def in_set(s, a):
            for x in s:
                if a.is_same_as(x):
                    return x
            return None

        print '#feeds', len(self)
        print map(len, self)

        for f in self:
            dups = []
            for a in f:
                first = in_set(found_articles, a)
                if first is not None:
                    # Duplicate of an article seen earlier: record the
                    # (original, containing feed) pair for later restoration.
                    dups.append(a)
                    duplicates.add((first, f))
                else:
                    found_articles.add(a)
            # Remove duplicates only after iterating, to avoid mutating the
            # feed while it is being traversed.
            for x in dups:
                f.articles.remove(x)

        self.duplicates = duplicates
        print len(duplicates)
        print map(len, self)
        #raise

    def find_article(self, article):
        # Locate *article* by identity and return (feed_index, article_index),
        # or None (implicitly) if it is not present in any feed.
        for j, f in enumerate(self):
            for i, a in enumerate(f):
                if a is article:
                    return (j, i)

    def restore_duplicates(self):
        # Re-append each removed duplicate to its feed as a deep copy whose
        # URL points at the already-downloaded first occurrence.
        temp = []
        for article, feed in self.duplicates:
            art = copy.deepcopy(article)
            j, i = self.find_article(article)
            art.url = '../feed_%d/article_%d/index.html'%(j, i)
            temp.append((feed, art))
        # Appending is deferred so the lookups above see unmodified feeds.
        for feed, art in temp:
            feed.articles.append(art)
def feed_from_xml(raw_xml, title=None, oldest_article=7,
max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)):

View File

@ -289,15 +289,16 @@ class BasicNewsRecipe(object, LoggingInterface):
'''
return soup
def postprocess_html(self, soup):
def postprocess_html(self, soup, first_fetch):
'''
This method is called with the source of each downloaded :term:`HTML` file, after
it is parsed for links and images.
It can be used to do arbitrarily powerful post-processing on the :term:`HTML`.
It should return `soup` after processing it.
`soup`: A `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
:param soup: A `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
instance containing the downloaded :term:`HTML`.
:param first_fetch: True if this is the first page of an article.
'''
return soup
@ -482,7 +483,7 @@ class BasicNewsRecipe(object, LoggingInterface):
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(0, elem)
return self.postprocess_html(soup)
return self.postprocess_html(soup, first_fetch)
def download(self):

View File

@ -67,7 +67,7 @@ class ESPN(BasicNewsRecipe):
return soup
def postprocess_html(self, soup):
def postprocess_html(self, soup, first_fetch):
    '''Left-align every <div> that carries an inline style, since ESPN centres content.'''
    for tag in soup.findAll('div', style=True):
        css = tag['style']
        tag['style'] = css.replace('center', 'left')
    return soup

View File

@ -92,7 +92,7 @@ class Newsweek(BasicNewsRecipe):
return sections
def postprocess_html(self, soup):
def postprocess_html(self, soup, first_fetch):
divs = list(soup.findAll('div', 'pagination'))
if not divs:
return

View File

@ -73,7 +73,7 @@ class OutlookIndia(BasicNewsRecipe):
return feeds
def postprocess_html(self, soup):
def postprocess_html(self, soup, first_fetch):
bad = []
for table in soup.findAll('table'):
if table.find(text=re.compile(r'\(\d+ of \d+\)')):

View File

@ -7,14 +7,16 @@ __docformat__ = 'restructuredtext en'
sciam.com
'''
import re
from lxml import html
from calibre.web.feeds.news import BasicNewsRecipe
class ScientificAmerican(BasicNewsRecipe):
title = u'Scientific American'
description = u'Popular science'
description = u'Popular science. Monthly magazine.'
__author__ = 'Kovid Goyal'
oldest_article = 30
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
remove_tags_before = dict(name='div', attrs={'class':'headline'})
remove_tags_after = dict(id='article')
@ -26,25 +28,102 @@ class ScientificAmerican(BasicNewsRecipe):
html2lrf_options = ['--base-font-size', '8']
recursions = 1
match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14)']
feeds = [
(u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'),
(u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'),
(u'Health', u'http://rss.sciam.com/sciam/health'),
(u'Space', u'http://rss.sciam.com/sciam/space'),
(u'Technology', u'http://rss.sciam.com/sciam/technology'),
(u'Biology', u'http://rss.sciam.com/sciam/biology'),
(u'Mind & Brain', u'http://rss.sciam.com/sciam/mind-and-brain'),
(u"What's Next", u'http://rss.sciam.com/sciam/whats-next'),
(u'Archeology and Paleontology', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'),
(u'Physics', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'),
(u'Math', u'http://rss.sciam.com/sciam/math'),
(u'History of Science', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'),
(u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'),
(u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog')
]
# feeds = [
# (u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'),
# (u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'),
# (u'Health', u'http://rss.sciam.com/sciam/health'),
# (u'Space', u'http://rss.sciam.com/sciam/space'),
# (u'Technology', u'http://rss.sciam.com/sciam/technology'),
# (u'Biology', u'http://rss.sciam.com/sciam/biology'),
# (u'Mind & Brain', u'http://rss.sciam.com/sciam/mind-and-brain'),
# (u"What's Next", u'http://rss.sciam.com/sciam/whats-next'),
# (u'Archeology and Paleontology', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'),
# (u'Physics', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'),
# (u'Math', u'http://rss.sciam.com/sciam/math'),
# (u'History of Science', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'),
# (u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'),
# (u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog')
# ]
#
def parse_index(self):
    # Build the download list from the current issue's magazine page rather
    # than the RSS feeds, so that only articles in the current issue are
    # fetched. Returns a list of (section_title, articles) tuples in the
    # format expected by BasicNewsRecipe.
    src = self.browser.open('http://www.sciam.com/sciammag/').read()
    root = html.fromstring(src)
    # The cover is the only <img> whose src contains "cover_" (matched via
    # the EXSLT regular-expressions extension).
    self.cover_url = root.xpath('//img[re:match(@src, "cover_")]',
            namespaces={'re':'http://exslt.org/regular-expressions'}
            )[0].get('src')
    # Show the issue month as the timestamp string of the generated ebook.
    self.timefmt = root.xpath('//div[@id = "magazine-month"]')[0].text
    feeds = []

    # Feature articles: links whose title attribute is "Feature".
    features = []
    for a in root.xpath('//a[@href and @title = "Feature"]'):
        if not a.text.strip():
            continue
        article = {
            'url'         : a.get('href'),
            'title'       : u''.join(a.xpath('./text()')),
            'date'        : '',
            'description' : '',
        }
        # Sibling <span class="sub"> elements carry the article blurb.
        for s in a.itersiblings('span'):
            if s.get('class', '') == 'sub':
                article['description'] += u''.join(s.xpath('./text()')) + ' '
        features.append(article)
    if features:
        feeds.append(('Features', features))

    # Department articles: links with class="title"; their description is the
    # remaining text of the parent element after the link itself is removed.
    departments = []
    for a in root.xpath('//a[@href and @class="title"]'):
        txt = u''.join(a.xpath('./text()')).strip()
        if not txt:
            continue
        article = {
            'url'         : a.get('href'),
            'title'       : txt,
            'date'        : '',
            'description' : '',
        }
        p = a.getparent()
        p.remove(a)
        article['description'] = u''.join(p.xpath('./text()'))
        departments.append(article)
    feeds.append(('Departments', departments))

    # Opinion pieces live under <div id="opinion">.
    opinion = []
    for a in root.xpath('//div[@id = "opinion"]//a[@href]'):
        txt = u''.join(a.xpath('./text()')).strip()
        if not txt:
            continue
        article = {
            'url'         : a.get('href'),
            'title'       : txt,
            'date'        : '',
            'description' : '',
        }
        opinion.append(article)
    feeds.append(('Opinion', opinion))

    # Web-only extras live under <div id="ontheweb">.
    ontheweb = []
    for a in root.xpath('//div[@id = "ontheweb"]//a[@href]'):
        txt = u''.join(a.xpath('./text()')).strip()
        if not txt:
            continue
        article = {
            'url'         : a.get('href'),
            'title'       : txt,
            'date'        : '',
            'description' : '',
        }
        ontheweb.append(article)
    feeds.append(('On the web', ontheweb))

    return feeds
def postprocess_html(self, soup):
def postprocess_html(self, soup, first_fetch):
    '''
    Strip pagination controls from every page and, on continuation pages
    of a multi-page article, the repeated headline block.
    '''
    if soup is None:
        return soup
    for pagination in soup.findAll('span', attrs={'class':'pagination'}):
        pagination.extract()
    if not first_fetch:
        headline = soup.find('div', attrs={'class':'headline'})
        if headline:
            headline.extract()
    return soup

View File

@ -198,7 +198,7 @@ class RecursiveFetcher(object, LoggingInterface):
try:
f = self.fetch_url(iurl)
except Exception, err:
self.log_warning('Could not fetch stylesheet %s', iurl)
self.log_debug('Could not fetch stylesheet %s', iurl)
self.log_debug('Error: %s', str(err), exc_info=True)
continue
stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')