Merge from trunk

This commit is contained in:
Charles Haley 2013-02-03 16:22:16 +01:00
commit d328d20bc6
9 changed files with 275 additions and 35 deletions

View File

@ -14,7 +14,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
remove_empty_feeds = True
remove_javascript = True
no_stylesheets = True
#auto_cleanup = True
auto_cleanup = True
language = 'en_GB'
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/161987_9010212100_2035706408_n.jpg'
@ -23,7 +23,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
def get_cover_url(self):
soup = self.index_to_soup('http://www.birminghampost.net')
# look for the block containing the sun button and url
cov = soup.find(attrs={'height' : re.compile('3'), 'alt' : re.compile('Birmingham Post')})
cov = soup.find(attrs={'height' : re.compile('3'), 'alt' : re.compile('Post')})
print
print '%%%%%%%%%%%%%%%',cov
print
@ -43,20 +43,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
return cover_url
keep_only_tags = [
dict(attrs={'id' : 'article-header'}),
#dict(name='h1',attrs={'id' : 'article-header'}),
dict(attrs={'class':['article-meta-author','article-meta-date','article main','art-o art-align-center otm-1 ']}),
dict(name='div',attrs={'class' : 'article-image full'}),
dict(attrs={'clas' : 'art-o art-align-center otm-1 '}),
dict(name='div',attrs={'class' : 'article main'}),
#dict(name='p')
#dict(attrs={'id' : 'three-col'})
]
remove_tags = [
# dict(name='div',attrs={'class' : 'span-33 last header-links'})
]
feeds = [
#(u'News',u'http://www.birminghampost.net/news/rss.xml'),
(u'West Mids. News', u'http://www.birminghampost.net/news/west-midlands-news/rss.xml'),
@ -65,9 +52,3 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
(u'Bloggs & Comments',u'http://www.birminghampost.net/comment/rss.xml')
]
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;text-align:center;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''

View File

@ -62,7 +62,8 @@ class HBR(BasicNewsRecipe):
today = date.today()
future = today + timedelta(days=30)
for x in [x.strftime('%y%m') for x in (future, today)]:
past = today - timedelta(days=30)
for x in [x.strftime('%y%m') for x in (future, today, past)]:
url = self.INDEX + x
soup = self.index_to_soup(url)
if (not soup.find(text='Issue Not Found') and not soup.find(

Binary file not shown.

View File

@ -26,7 +26,7 @@ def get_opts_from_parser(parser):
class Coffee(Command): # {{{
description = 'Compile coffeescript files into javascript'
COFFEE_DIRS = ('ebooks/oeb/display',)
COFFEE_DIRS = ('ebooks/oeb/display', 'ebooks/oeb/polish')
def add_options(self, parser):
parser.add_option('--watch', '-w', action='store_true', default=False,

View File

@ -216,7 +216,7 @@ class ANDROID(USBMS):
'PMP5097C', 'MASS', 'NOVO7', 'ZEKI', 'COBY', 'SXZ', 'USB_2.0',
'COBY_MID', 'VS', 'AINOL', 'TOPWISE', 'PAD703', 'NEXT8D12',
'MEDIATEK', 'KEENHI', 'TECLAST']
WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE',
WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'A953', 'INC.NEXUS_ONE',
'__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897',
'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959_CARD', 'SGH-T959', 'SAMSUNG_ANDROID',
'SCH-I500_CARD', 'SPH-D700_CARD', 'MB810', 'GT-P1000', 'DESIRE',

View File

@ -241,7 +241,7 @@ class KF8Writer(object):
j = 0
for tag in root.iterdescendants(etree.Element):
id_ = tag.attrib.get('id', None)
if id_ is None:
if id_ is None and tag.tag == XHTML('a'):
# Can happen during tweaking
id_ = tag.attrib.get('name', None)
if id_ is not None:

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, posixpath, logging, sys, hashlib, uuid
import os, logging, sys, hashlib, uuid
from urllib import unquote as urlunquote
from lxml import etree
@ -56,7 +56,7 @@ class Container(object):
# Special case if we have stumbled onto the opf
if path == opfpath:
self.opf_name = name
self.opf_dir = posixpath.dirname(path)
self.opf_dir = os.path.dirname(path)
self.mime_map[name] = guess_type('a.opf')[0]
# Update mime map with data from the OPF
@ -66,13 +66,25 @@ class Container(object):
href = item.get('href')
self.mime_map[self.href_to_name(href)] = item.get('media-type')
def abspath_to_name(self, fullpath):
    '''
    Convert a filesystem path into a name, i.e. the path as returned by
    self.relpath() with OS separators replaced by POSIX ones.
    '''
    absolute = os.path.abspath(fullpath)
    relative = self.relpath(absolute)
    return relative.replace(os.sep, '/')
def href_to_name(self, href, base=None):
'''
Convert an href (relative to base) to a name (i.e. a path
relative to self.root with POSIX separators).
base must be an absolute path with OS separators or None, in which case
the href is interpreted relative to the dir containing the OPF.
'''
if base is None:
base = self.opf_dir
href = urlunquote(href.partition('#')[0])
fullpath = posixpath.abspath(posixpath.join(base, href))
return self.relpath(fullpath)
fullpath = os.path.join(base, *href.split('/'))
return self.abspath_to_name(fullpath)
def has_name(self, name):
    ''' Return True if name is a key of self.name_path_map '''
    known_names = self.name_path_map
    return name in known_names
def relpath(self, path):
    ''' Return path expressed relative to self.root '''
    root = self.root
    return relpath(path, root)
@ -345,10 +357,14 @@ class AZW3Container(Container):
super(AZW3Container, self).__init__(tdir, opf_path, log)
self.obfuscated_fonts = {x.replace(os.sep, '/') for x in obfuscated_fonts}
def get_container(path, log=None):
    '''
    Create the container object for the book at path. Files whose
    extension is azw3 or mobi (case insensitive) get an AZW3Container,
    everything else an EpubContainer. If log is None, default_log is used.
    '''
    if log is None:
        log = default_log
    extension = path.rpartition('.')[-1].lower()
    if extension in {'azw3', 'mobi'}:
        container_class = AZW3Container
    else:
        container_class = EpubContainer
    return container_class(path, log)
if __name__ == '__main__':
f = sys.argv[-1]
ebook = (AZW3Container if f.rpartition('.')[-1].lower() in {'azw3', 'mobi'}
else EpubContainer)(f, default_log)
ebook = get_container(sys.argv[-1])
for s in ebook.spine_items:
print (ebook.relpath(s))

View File

@ -0,0 +1,72 @@
#!/usr/bin/env coffee
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
###
Copyright 2013, Kovid Goyal <kovid at kovidgoyal.net>
Released under the GPLv3 License
###
# Borrow the logging function from calibre_utils when that object is present
log = window.calibre_utils.log if window?.calibre_utils
font_dict = (style, computed=false) ->
    # Build an object with the font-family, font-weight, font-style and
    # font-stretch of a style declaration. When computed is true the
    # font-family is returned as an array of strings read from the CSS value
    # object, otherwise it is the raw property value string.
    if not computed
        families = style.getPropertyValue('font-family')
    else
        value = style.getPropertyCSSValue('font-family')
        if value.cssValueType == CSSValue.CSS_PRIMITIVE_VALUE
            families = [value.getStringValue()]
        else
            families = (item.getStringValue() for item in value)
    ans = {'font-family':families}
    for prop in ['font-weight', 'font-style', 'font-stretch']
        ans[prop] = style.getPropertyValue(prop)
    return ans
font_usage = (node) ->
    # Return the computed font properties of node (see font_dict) together
    # with a 'text' array holding the value of each of its direct text node
    # children.
    ans = font_dict(window.getComputedStyle(node, null), true)
    ans['text'] = (child.nodeValue for child in node.childNodes when child.nodeType == Node.TEXT_NODE)
    return ans
class FontStats
    # This class is a namespace to expose functions via the
    # window.font_stats object. Results are handed back by assigning them to
    # py_bridge.value (py_bridge is a global that must already exist when
    # the methods are called).

    constructor: () ->
        # The parentheses are required: `not this instanceof X` parses as
        # `(not this) instanceof X`, which is always false, so without them
        # this guard could never fire.
        if not (this instanceof arguments.callee)
            throw new Error('FontStats constructor called as function')

    get_font_face_rules: () ->
        # Store in py_bridge.value one font_dict() per @font-face rule found
        # in the document's style sheets, each with the rule's raw 'src'
        # property value added.
        font_faces = []
        for sheet in document.styleSheets
            for rule in sheet.cssRules
                if rule.type == rule.FONT_FACE_RULE
                    fd = font_dict(rule.style)
                    fd['src'] = rule.style.getPropertyValue('src')
                    font_faces.push(fd)
        py_bridge.value = font_faces

    get_font_usage: () ->
        # Store in py_bridge.value the font_usage() of document.body followed
        # by that of every element inside it.
        ans = []
        busage = font_usage(document.body)
        if busage != null
            ans.push(busage)
        for node in document.body.getElementsByTagName('*')
            usage = font_usage(node)
            if usage != null
                ans.push(usage)
        py_bridge.value = ans
# Publish the namespace object whenever a window object is available
window.font_stats = new FontStats() if window?

View File

@ -7,19 +7,107 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import json
import json, sys, os
from urllib import unquote
from cssutils import parseStyle
from PyQt4.Qt import (QWebPage, pyqtProperty, QString, QEventLoop, QWebView,
Qt, QSize, QTimer)
from calibre.constants import iswindows
from calibre.ebooks.oeb.display.webview import load_html
from calibre.gui2 import must_use_qt
class Page(QWebPage):
def normalize_font_properties(font):
    '''
    Canonicalise the font-weight, font-style and font-stretch entries of the
    mapping font, in place. Nothing is returned.

    font-weight becomes one of the strings '100' to '900': 'normal' maps to
    '400', 'bold' to '700' and any other unrecognised, missing or empty value
    falls back to '400'. font-style and font-stretch are reset to 'normal'
    unless they already hold a recognised keyword.
    '''
    weight = font.get('font-weight', None)
    # None and '' count as missing, a numeric 0 is passed on to the
    # validity check below instead
    if not weight and weight != 0:
        weight = 'normal'
    weight = unicode(weight)
    weight = {'normal':'400', 'bold':'700'}.get(weight, weight)
    valid_weights = {'%d00' % n for n in range(1, 10)}
    font['font-weight'] = weight if weight in valid_weights else '400'

    keywords = (
        ('font-style', {'normal', 'italic', 'oblique'}),
        ('font-stretch', {'normal', 'ultra-condensed', 'extra-condensed',
            'condensed', 'semi-condensed', 'semi-expanded', 'expanded',
            'extra-expanded', 'ultra-expanded'}),
    )
    for prop, allowed in keywords:
        current = font.get(prop, None)
        font[prop] = current if current in allowed else 'normal'
# Position of every font-stretch keyword on the narrowest to widest scale,
# 'normal' sits in the middle at index 4
widths = dict((keyword, position) for position, keyword in enumerate((
    'ultra-condensed', 'extra-condensed', 'condensed', 'semi-condensed',
    'normal',
    'semi-expanded', 'expanded', 'extra-expanded', 'ultra-expanded',
)))
def get_matching_rules(rules, font):
    '''
    Return the @font-face rules from rules that best match font.

    rules is a sequence of dicts that have a 'font-family' entry (a set of
    lower cased family names), integer 'width' and 'weight' entries and
    optionally a 'font-style' entry. font is a dict of font properties; it is
    normalized in place by normalize_font_properties() and its 'font-family'
    is expected to be an iterable of family names.

    The candidates are narrowed down successively by family, stretch, style
    and weight. An empty list is returned if nothing matches. Rules that
    appear later in rules are listed first in the result.
    '''
    normalize_font_properties(font)

    # Filter on family. The wanted families are the same for every rule, so
    # they are computed only once.
    wanted = frozenset(icu_lower(x) for x in font.get('font-family', []))
    matches = [rule for rule in reversed(rules)
            if wanted.intersection(rule['font-family'])]
    if not matches:
        return []

    # Filter on font stretch: keep the faces whose width is nearest to the
    # wanted one, preferring narrower faces for normal or condensed requests
    # and wider faces for expanded requests
    width = widths[font.get('font-stretch', 'normal')]
    min_dist = min(abs(width-f['width']) for f in matches)
    nearest = [f for f in matches if abs(width-f['width']) == min_dist]
    if width <= 4:
        preferred = [f for f in nearest if f['width'] <= width]
    else:
        preferred = [f for f in nearest if f['width'] >= width]
    matches = (preferred or nearest)

    # Filter on font-style
    fs = font.get('font-style', 'normal')
    order = {
            'oblique':['oblique', 'italic', 'normal'],
            'normal':['normal', 'oblique', 'italic']
    }.get(fs, ['italic', 'oblique', 'normal'])
    for q in order:
        m = [f for f in matches if f.get('font-style', 'normal') == q]
        if m:
            matches = m
            break

    # Filter on font weight. The heavier weights used to be generated with
    # the stop and step arguments swapped, which always gave an empty
    # sequence, so heavier faces were never considered as a fallback.
    fw = int(font.get('font-weight', '400'))
    lighter = list(range(fw-100, 0, -100))
    heavier = list(range(fw+100, 1000, 100))
    if fw == 400:
        q = [400, 500, 300, 200, 100, 600, 700, 800, 900]
    elif fw == 500:
        q = [500, 400, 300, 200, 100, 600, 700, 800, 900]
    elif fw < 400:
        q = [fw] + lighter + heavier
    else:
        q = [fw] + heavier + lighter
    for wt in q:
        m = [f for f in matches if f['weight'] == wt]
        if m:
            return m
    return []
class Page(QWebPage): # {{{
def __init__(self, log):
# log is called with the text of JavaScript console messages, see
# javaScriptConsoleMessage()
self.log = log
QWebPage.__init__(self)
# Compiled JavaScript source, filled in on the first call to load_js()
self.js = None
# Shortcut to evaluate JavaScript in the main frame of this page
self.evaljs = self.mainFrame().evaluateJavaScript
# NOTE(review): presumably updated by the _pass_json_value setter when the
# JavaScript side assigns py_bridge.value -- the setter is not visible here
self.bridge_value = None
def javaScriptConsoleMessage(self, msg, lineno, msgid):
    # Forward the text of a JavaScript console message to the log, the line
    # number and message id are ignored
    text = unicode(msg)
    self.log(u'JS:', text)
@ -40,6 +128,23 @@ class Page(QWebPage):
_pass_json_value = pyqtProperty(QString, fget=_pass_json_value_getter,
fset=_pass_json_value_setter)
def load_js(self):
# Inject the JavaScript needed for font statistics into the main frame.
# The compiled coffeescript (display utils followed by font_stats) is
# fetched only once and cached in self.js.
if self.js is None:
from calibre.utils.resources import compiled_coffeescript
self.js = compiled_coffeescript('ebooks.oeb.display.utils')
self.js += compiled_coffeescript('ebooks.oeb.polish.font_stats')
# Expose this page object to JavaScript under the name py_bridge
self.mainFrame().addToJavaScriptWindowObject("py_bridge", self)
self.evaljs(self.js)
# Define a py_bridge.value accessor pair that moves data through the
# _pass_json_value property as JSON text
self.evaljs('''
py_bridge.__defineGetter__('value', function() {
return JSON.parse(this._pass_json_value);
});
py_bridge.__defineSetter__('value', function(val) {
this._pass_json_value = JSON.stringify(val);
});
''')
# }}}
class StatsCollector(object):
def __init__(self, container):
@ -85,6 +190,7 @@ class StatsCollector(object):
self.loop.exit(1)
return
try:
self.page.load_js()
self.collect_font_stats()
except:
self.log.exception('Failed to collect font stats from: %s'%self.container.relpath(self.current_item))
@ -94,6 +200,70 @@ class StatsCollector(object):
self.render_book()
def collect_font_stats(self):
# Record, for every font file referenced by a usable @font-face rule of the
# currently loaded page, the set of characters that are rendered with it.
# Results accumulate in self.font_stats, a mapping of font name to set of
# characters.
# The following statement is a no-op, everything below it still runs
pass
self.page.evaljs('window.font_stats.get_font_face_rules()')
# NOTE(review): bridge_value is presumably filled in when the JavaScript
# side assigns py_bridge.value -- confirm against the property setter
font_face_rules = self.page.bridge_value
if not isinstance(font_face_rules, list):
raise Exception('Unknown error occurred while reading font-face rules')

# Weed out invalid font-face rules
rules = []
for rule in font_face_rules:
ff = rule.get('font-family', None)
if not ff: continue
# Let cssutils split the family declaration into individual values
style = parseStyle('font-family:%s'%ff, validate=False)
ff = [x.value for x in
style.getProperty('font-family').propertyValue]
if not ff or ff[0] == 'inherit':
continue
# Stored lower cased so that get_matching_rules() can intersect it with the
# lower cased families of the font being matched
rule['font-family'] = frozenset(icu_lower(f) for f in ff)

src = rule.get('src', None)
if not src: continue
# Parse src as if it were a background-image declaration to get at the
# URI; only the first value of the declaration is looked at
style = parseStyle('background-image:%s'%src, validate=False)
src = style.getProperty('background-image').propertyValue[0].uri
if not src.startswith('file://'):
self.log.warn('Unknown URI in @font-face: %r'%src)
continue
# Turn the file:// URL into a filesystem path with OS separators
src = src[len('file://'):]
if iswindows and src.startswith('/'):
src = src[1:]
src = src.replace('/', os.sep)
src = unquote(src)
name = self.container.abspath_to_name(src)
if not self.container.has_name(name):
self.log.warn('Font %r referenced in @font-face rule not found'
%name)
continue
rule['src'] = name
normalize_font_properties(rule)
# Numeric forms of stretch and weight, as read by get_matching_rules()
rule['width'] = widths[rule['font-stretch']]
rule['weight'] = int(rule['font-weight'])
rules.append(rule)

if not rules:
return

# Make sure every referenced font has an entry, even if no text uses it
for rule in rules:
if rule['src'] not in self.font_stats:
self.font_stats[rule['src']] = set()

self.page.evaljs('window.font_stats.get_font_usage()')
font_usage = self.page.bridge_value
if not isinstance(font_usage, list):
raise Exception('Unknown error occurred while reading font usage')
# Characters that are never counted as font usage
exclude = {'\n', '\r', '\t'}
for font in font_usage:
# Gather the distinct characters of all text strings of this entry
text = set()
for t in font['text']:
text |= frozenset(t)
text.difference_update(exclude)
if not text: continue
for rule in get_matching_rules(rules, font):
self.font_stats[rule['src']] |= text
if __name__ == '__main__':
    # Print the font statistics of the book whose path is the last command
    # line argument, with debug level logging
    from calibre.ebooks.oeb.polish.container import get_container
    from calibre.utils.logging import default_log
    default_log.filter_level = default_log.DEBUG
    book_path = sys.argv[-1]
    ebook = get_container(book_path, default_log)
    stats = StatsCollector(ebook).font_stats
    print (stats)