Merge from trunk

Commit 22a1422ec6 by Charles Haley, 2013-04-04 09:27:51 +02:00
11 changed files with 628 additions and 237 deletions

recipes/am730.recipe (new file, 290 lines)

@@ -0,0 +1,290 @@
# vim:fileencoding=UTF-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Eddie Lau'
__Date__ = ''
__HiResImg__ = True

'''
Change Log:
2013/03/30 -- first version
'''

from calibre import (__appname__, force_unicode, strftime)
from calibre.utils.date import now as nowf
import os, datetime, re
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang

class AppleDaily(BasicNewsRecipe):
    title = u'AM730'
    __author__ = 'Eddie Lau'
    publisher = 'AM730'
    oldest_article = 1
    max_articles_per_feed = 100
    auto_cleanup = False
    language = 'zh'
    encoding = 'utf-8'
    auto_cleanup = False
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    description = 'http://www.am730.com.hk'
    category = 'Chinese, News, Hong Kong'
    masthead_url = 'http://www.am730.com.hk/images/logo.jpg'
    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} photocaption {font-size:50%; margin-left:auto; margin-right:auto;}'
    keep_only_tags = [dict(name='div', attrs={'id':'articleHeader'}),
                      dict(name='div', attrs={'class':'thecontent wordsnap'}),
                      dict(name='a', attrs={'class':'lightboximg'})]
    remove_tags = [dict(name='img', attrs={'src':'/images/am730_article_logo.jpg'}),
                   dict(name='img', attrs={'src':'/images/am_endmark.gif'})]

    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        # convert UTC to local hk time - at HKT 6am, all news are available
        return dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24)

    def get_fetchdate(self):
        if __Date__ <> '':
            return __Date__
        else:
            return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
        if __Date__ <> '':
            return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchyear(self):
        if __Date__ <> '':
            return __Date__[0:4]
        else:
            return self.get_dtlocal().strftime("%Y")

    def get_fetchmonth(self):
        if __Date__ <> '':
            return __Date__[4:6]
        else:
            return self.get_dtlocal().strftime("%m")

    def get_fetchday(self):
        if __Date__ <> '':
            return __Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%d")

    # Note: does not work with custom date given by __Date__
    def get_weekday(self):
        return self.get_dtlocal().weekday()

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article, picdiv['src'])

    def parse_index(self):
        feeds = []
        soup = self.index_to_soup('http://www.am730.com.hk/')
        ul = soup.find(attrs={'class':'nav-section'})
        sectionList = []
        for li in ul.findAll('li'):
            a = 'http://www.am730.com.hk/' + li.find('a', href=True).get('href', False)
            title = li.find('a').get('title', False).strip()
            sectionList.append((title, a))
        for title, url in sectionList:
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def parse_section(self, url):
        soup = self.index_to_soup(url)
        items = soup.findAll(attrs={'style':'padding-bottom: 15px;'})
        current_articles = []
        for item in items:
            a = item.find(attrs={'class':'t6 f14'}).find('a', href=True)
            articlelink = 'http://www.am730.com.hk/' + a.get('href', True)
            title = self.tag_to_string(a)
            description = self.tag_to_string(item.find(attrs={'class':'t3 f14'}))
            current_articles.append({'title': title, 'url': articlelink, 'description': description})
        return current_articles

    def preprocess_html(self, soup):
        multia = soup.findAll('a')
        for a in multia:
            if not (a == None):
                image = a.find('img')
                if not (image == None):
                    if __HiResImg__:
                        image['src'] = image.get('src').replace('/thumbs/', '/')
                    caption = image.get('alt')
                    tag = Tag(soup, "photo", [])
                    tag2 = Tag(soup, "photocaption", [])
                    tag.insert(0, image)
                    if not caption == None:
                        tag2.insert(0, caption)
                    tag.insert(1, tag2)
                    a.replaceWith(tag)
        return soup

    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        title = self.short_title()
        if self.output_profile.periodical_date_in_title:
            title += strftime(self.timefmt)
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        if self.publication_type:
            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
        mi.timestamp = nowf()
        article_titles, aseen = [], set()
        for f in feeds:
            for a in f:
                if a.title and a.title not in aseen:
                    aseen.add(a.title)
                    article_titles.append(force_unicode(a.title, 'utf-8'))
        mi.comments = self.description
        if not isinstance(mi.comments, unicode):
            mi.comments = mi.comments.decode('utf-8', 'replace')
        mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
                        '\n\n'.join(article_titles))
        language = canonicalize_lang(self.language)
        if language is not None:
            mi.language = language
        # This one affects the pub date shown in kindle title
        #mi.pubdate = nowf()
        # now appears to need the time field to be > 12.00noon as well
        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')
        opf = OPFCreator(dir, mi)
        # Add mastheadImage entry to <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)
        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))
        # Get cover
        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
            if self.default_cover(pf):
                cpath = pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)
        # Get masthead
        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)
        opf.create_manifest_from_files_in(manifest)
        for mani in opf.manifest:
            if mani.path.endswith('.ncx'):
                mani.id = 'ncx'
            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'
        entries = ['index.html']
        toc = TOC(base_path=dir)
        self.play_order_counter = 0
        self.play_order_map = {}

        def feed_index(num, parent):
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/'%(num, j)
                    auth = a.author
                    if not auth:
                        auth = None
                    desc = a.text_summary
                    if not desc:
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                    entries.append('%sindex.html'%adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html'%adir, None,
                            a.title if a.title else _('Untitled Article'),
                            play_order=po, author=auth,
                            description=desc, toc_thumbnail=tt)
                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp
                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                            not self.has_single_feed,
                                            a.orig_url, __appname__, prefix=prefix,
                                            center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(unicode(soup).encode('utf-8'))

        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')

        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html'%i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                        f.title, play_order=po, description=desc, author=auth))
        else:
            entries.append('feed_%d/index.html'%0)
            feed_index(0, toc)

        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)

        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)
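
A note on the date arithmetic in get_dtlocal() above: UTC is shifted +8 hours to Hong Kong time, then -6 hours, so the recipe only rolls over to a new edition once it is past 6am HKT. A minimal standalone sketch of the same logic (not part of the commit; fetch_date is a hypothetical stand-in for get_fetchdate):

    import datetime

    def fetch_date(dt_utc):
        # +8h takes UTC to HKT; -6h makes the date roll over at 6am HKT
        dt = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24)
        return dt.strftime("%Y%m%d")

    print(fetch_date(datetime.datetime(2013, 3, 29, 21, 0)))  # 5am HKT -> '20130329'
    print(fetch_date(datetime.datetime(2013, 3, 29, 23, 0)))  # 7am HKT -> '20130330'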

@@ -1,161 +1,275 @@
-# -*- coding: utf-8 -*-
-import re
+# vim:fileencoding=UTF-8
+from __future__ import unicode_literals
+__license__ = 'GPL v3'
+__copyright__ = '2013, Eddie Lau'
+__Date__ = ''
+from calibre import (__appname__, force_unicode, strftime)
+from calibre.utils.date import now as nowf
+import os, datetime, re
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from contextlib import nested
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.metadata.toc import TOC
+from calibre.ebooks.metadata import MetaInformation
+from calibre.utils.localization import canonicalize_lang

 class AppleDaily(BasicNewsRecipe):
-    title = u'蘋果日報'
-    __author__ = u'蘋果日報'
-    __publisher__ = u'蘋果日報'
-    description = u'蘋果日報'
-    masthead_url = 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
-    language = 'zh_TW'
-    encoding = 'UTF-8'
-    timefmt = ' [%a, %d %b, %Y]'
-    needs_subscription = False
+    title = u'蘋果日報 (香港)'
+    __author__ = 'Eddie Lau'
+    publisher = '蘋果日報'
+    oldest_article = 1
+    max_articles_per_feed = 100
+    auto_cleanup = False
+    language = 'zh'
+    encoding = 'utf-8'
+    auto_cleanup = False
     remove_javascript = True
-    remove_tags_before = dict(name=['ul', 'h1'])
-    remove_tags_after = dict(name='form')
-    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
-                dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
-                dict(name=['script', 'noscript', 'style', 'form'])]
+    use_embedded_content = False
     no_stylesheets = True
-    extra_css = '''
-        @font-face {font-family: "uming", serif, sans-serif; src: url(res:///usr/share/fonts/truetype/arphic/uming.ttc); }\n
-        body {margin-right: 8pt; font-family: 'uming', serif;}
-        h1 {font-family: 'uming', serif, sans-serif}
-        '''
-    #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
-
-    preprocess_regexps = [
-        (re.compile(r'img.php?server=(?P<server>[^&]+)&path=(?P<path>[^&]+).*', re.DOTALL|re.IGNORECASE),
-        lambda match: 'http://' + match.group('server') + '/' + match.group('path')),
-    ]
+    description = 'http://hkm.appledaily.com/'
+    category = 'Chinese, News, Hong Kong'
+    masthead_url = 'http://upload.wikimedia.org/wikipedia/zh/c/cf/AppleDailyLogo1.png'
+    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} h1 {font-size:200%; text-align:left; font-weight:bold;} p[class=video-caption] {font-size:50%; margin-left:auto; margin-right:auto;}'
+    keep_only_tags = [dict(name='div', attrs={'id':'content-article'})]
+    remove_tags = [dict(name='div', attrs={'class':'prev-next-btn'}),
+                   dict(name='p', attrs={'class':'next'})]
+
+    def get_dtlocal(self):
+        dt_utc = datetime.datetime.utcnow()
+        # convert UTC to local hk time - at HKT 6am, all news are available
+        return dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24)
+
+    def get_fetchdate(self):
+        if __Date__ <> '':
+            return __Date__
+        else:
+            return self.get_dtlocal().strftime("%Y%m%d")
+
+    def get_fetchformatteddate(self):
+        if __Date__ <> '':
+            return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
+        else:
+            return self.get_dtlocal().strftime("%Y-%m-%d")
+
+    def get_fetchyear(self):
+        if __Date__ <> '':
+            return __Date__[0:4]
+        else:
+            return self.get_dtlocal().strftime("%Y")
+
+    def get_fetchmonth(self):
+        if __Date__ <> '':
+            return __Date__[4:6]
+        else:
+            return self.get_dtlocal().strftime("%m")
+
+    def get_fetchday(self):
+        if __Date__ <> '':
+            return __Date__[6:8]
+        else:
+            return self.get_dtlocal().strftime("%d")
+
+    # Note: does not work with custom date given by __Date__
+    def get_weekday(self):
+        return self.get_dtlocal().weekday()

     def get_cover_url(self):
-        return 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
-
-    #def get_browser(self):
-        #br = BasicNewsRecipe.get_browser(self)
-        #if self.username is not None and self.password is not None:
-        #    br.open('http://www.nytimes.com/auth/login')
-        #    br.select_form(name='login')
-        #    br['USERID'] = self.username
-        #    br['PASSWORD'] = self.password
-        #    br.submit()
-        #return br
-
-    def preprocess_html(self, soup):
-        #process all the images
-        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
-            iurl = tag['src']
-            #print 'checking image: ' + iurl
-            #img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
-            p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
-            m = p.search(iurl)
-            if m is not None:
-                iurl = 'http://' + m.group('server') + '/' + m.group('path')
-                #print 'working! new url: ' + iurl
-                tag['src'] = iurl
-            #else:
-                #print 'not good'
-        for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
-            iurl = tag['href']
-            #print 'checking image: ' + iurl
-            #img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
-            p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
-            m = p.search(iurl)
-            if m is not None:
-                iurl = 'http://' + m.group('server') + '/' + m.group('path')
-                #print 'working! new url: ' + iurl
-                tag['href'] = iurl
-            #else:
-                #print 'not good'
-        return soup
+        soup = self.index_to_soup('http://hkm.appledaily.com/')
+        cover = soup.find(attrs={'class':'top-news'}).get('src', False)
+        br = BasicNewsRecipe.get_browser(self)
+        try:
+            br.open(cover)
+        except:
+            cover = None
+        return cover
+
+    def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            picdiv = soup.find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,picdiv['src'])

     def parse_index(self):
-        base = 'http://news.hotpot.hk/fruit'
-        soup = self.index_to_soup('http://news.hotpot.hk/fruit/index.php')
-
-        #def feed_title(div):
-        #    return ''.join(div.findAll(text=True, recursive=False)).strip()
-
-        articles = {}
-        key = None
-        ans = []
-        for div in soup.findAll('li'):
-            key = div.find(text=True, recursive=True);
-            #if key == u'豪情':
-            #    continue;
-
-            print 'section=' + key
-
-            articles[key] = []
-
-            ans.append(key)
-
-            a = div.find('a', href=True)
-
-            if not a:
-                continue
-
-            url = base + '/' + a['href']
-            print 'url=' + url
-
-            if not articles.has_key(key):
-                articles[key] = []
-            else:
-                # sub page
-                subSoup = self.index_to_soup(url)
-
-                for subDiv in subSoup.findAll('li'):
-                    subA = subDiv.find('a', href=True)
-                    subTitle = subDiv.find(text=True, recursive=True)
-                    subUrl = base + '/' + subA['href']
-
-                    print 'subUrl' + subUrl
-
-                    articles[key].append(
-                        dict(title=subTitle,
-                             url=subUrl,
-                             date='',
-                             description='',
-                             content=''))
-
-#        elif div['class'] in ['story', 'story headline']:
-#            a = div.find('a', href=True)
-#            if not a:
-#                continue
-#            url = re.sub(r'\?.*', '', a['href'])
-#            url += '?pagewanted=all'
-#            title = self.tag_to_string(a, use_alt=True).strip()
-#            description = ''
-#            pubdate = strftime('%a, %d %b')
-#            summary = div.find(True, attrs={'class':'summary'})
-#            if summary:
-#                description = self.tag_to_string(summary, use_alt=False)
-#
-#            feed = key if key is not None else 'Uncategorized'
-#            if not articles.has_key(feed):
-#                articles[feed] = []
-#            if not 'podcasts' in url:
-#                articles[feed].append(
-#                    dict(title=title, url=url, date=pubdate,
-#                         description=description,
-#                         content=''))
-#        ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
-        ans = [(unicode(key), articles[key]) for key in ans if articles.has_key(key)]
-        return ans
+        feeds = []
+        soup = self.index_to_soup('http://hkm.appledaily.com/')
+        ul = soup.find(attrs={'class':'menu'})
+        sectionList = []
+        for li in ul.findAll('li'):
+            a = 'http://hkm.appledaily.com/' + li.find('a', href=True).get('href', False)
+            title = li.find('a', text=True).strip()
+            if not title == u'動新聞':
+                sectionList.append((title, a))
+        for title, url in sectionList:
+            articles = self.parse_section(url)
+            if articles:
+                feeds.append((title, articles))
+        return feeds
+
+    def parse_section(self, url):
+        soup = self.index_to_soup(url)
+        ul = soup.find(attrs={'class':'list'})
+        current_articles = []
+        for li in ul.findAll('li'):
+            a = li.find('a', href=True)
+            title = li.find('p', text=True).strip()
+            if a is not None:
+                current_articles.append({'title': title, 'url':'http://hkm.appledaily.com/' + a.get('href', False)})
+                pass
+        return current_articles
+
+    def create_opf(self, feeds, dir=None):
+        if dir is None:
+            dir = self.output_dir
+        title = self.short_title()
+        if self.output_profile.periodical_date_in_title:
+            title += strftime(self.timefmt)
+        mi = MetaInformation(title, [__appname__])
+        mi.publisher = __appname__
+        mi.author_sort = __appname__
+        if self.publication_type:
+            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+        mi.timestamp = nowf()
+        article_titles, aseen = [], set()
+        for f in feeds:
+            for a in f:
+                if a.title and a.title not in aseen:
+                    aseen.add(a.title)
+                    article_titles.append(force_unicode(a.title, 'utf-8'))
+        mi.comments = self.description
+        if not isinstance(mi.comments, unicode):
+            mi.comments = mi.comments.decode('utf-8', 'replace')
+        mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
+                        '\n\n'.join(article_titles))
+        language = canonicalize_lang(self.language)
+        if language is not None:
+            mi.language = language
+        # This one affects the pub date shown in kindle title
+        #mi.pubdate = nowf()
+        # now appears to need the time field to be > 12.00noon as well
+        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
+        opf_path = os.path.join(dir, 'index.opf')
+        ncx_path = os.path.join(dir, 'index.ncx')
+        opf = OPFCreator(dir, mi)
+        # Add mastheadImage entry to <guide> section
+        mp = getattr(self, 'masthead_path', None)
+        if mp is not None and os.access(mp, os.R_OK):
+            from calibre.ebooks.metadata.opf2 import Guide
+            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
+            ref.type = 'masthead'
+            ref.title = 'Masthead Image'
+            opf.guide.append(ref)
+        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+        manifest.append(os.path.join(dir, 'index.html'))
+        manifest.append(os.path.join(dir, 'index.ncx'))
+        # Get cover
+        cpath = getattr(self, 'cover_path', None)
+        if cpath is None:
+            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
+            if self.default_cover(pf):
+                cpath = pf.name
+        if cpath is not None and os.access(cpath, os.R_OK):
+            opf.cover = cpath
+            manifest.append(cpath)
+        # Get masthead
+        mpath = getattr(self, 'masthead_path', None)
+        if mpath is not None and os.access(mpath, os.R_OK):
+            manifest.append(mpath)
+        opf.create_manifest_from_files_in(manifest)
+        for mani in opf.manifest:
+            if mani.path.endswith('.ncx'):
+                mani.id = 'ncx'
+            if mani.path.endswith('mastheadImage.jpg'):
+                mani.id = 'masthead-image'
+        entries = ['index.html']
+        toc = TOC(base_path=dir)
+        self.play_order_counter = 0
+        self.play_order_map = {}
+
+        def feed_index(num, parent):
+            f = feeds[num]
+            for j, a in enumerate(f):
+                if getattr(a, 'downloaded', False):
+                    adir = 'feed_%d/article_%d/'%(num, j)
+                    auth = a.author
+                    if not auth:
+                        auth = None
+                    desc = a.text_summary
+                    if not desc:
+                        desc = None
+                    else:
+                        desc = self.description_limiter(desc)
+                    tt = a.toc_thumbnail if a.toc_thumbnail else None
+                    entries.append('%sindex.html'%adir)
+                    po = self.play_order_map.get(entries[-1], None)
+                    if po is None:
+                        self.play_order_counter += 1
+                        po = self.play_order_counter
+                    parent.add_item('%sindex.html'%adir, None,
+                            a.title if a.title else _('Untitled Article'),
+                            play_order=po, author=auth,
+                            description=desc, toc_thumbnail=tt)
+                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
+                    for sp in a.sub_pages:
+                        prefix = os.path.commonprefix([opf_path, sp])
+                        relp = sp[len(prefix):]
+                        entries.append(relp.replace(os.sep, '/'))
+                        last = sp
+                    if os.path.exists(last):
+                        with open(last, 'rb') as fi:
+                            src = fi.read().decode('utf-8')
+                        soup = BeautifulSoup(src)
+                        body = soup.find('body')
+                        if body is not None:
+                            prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
+                            templ = self.navbar.generate(True, num, j, len(f),
+                                            not self.has_single_feed,
+                                            a.orig_url, __appname__, prefix=prefix,
+                                            center=self.center_navbar)
+                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+                            body.insert(len(body.contents), elem)
+                            with open(last, 'wb') as fi:
+                                fi.write(unicode(soup).encode('utf-8'))
+
+        if len(feeds) == 0:
+            raise Exception('All feeds are empty, aborting.')
+
+        if len(feeds) > 1:
+            for i, f in enumerate(feeds):
+                entries.append('feed_%d/index.html'%i)
+                po = self.play_order_map.get(entries[-1], None)
+                if po is None:
+                    self.play_order_counter += 1
+                    po = self.play_order_counter
+                auth = getattr(f, 'author', None)
+                if not auth:
+                    auth = None
+                desc = getattr(f, 'description', None)
+                if not desc:
+                    desc = None
+                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
+                        f.title, play_order=po, description=desc, author=auth))
+        else:
+            entries.append('feed_%d/index.html'%0)
+            feed_index(0, toc)
+
+        for i, p in enumerate(entries):
+            entries[i] = os.path.join(dir, p.replace('/', os.sep))
+        opf.create_spine(entries)
+        opf.set_toc(toc)
+
+        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
+            opf.render(opf_file, ncx_file)

@@ -37,68 +37,15 @@ class BusinessWeek(BasicNewsRecipe):
         , 'language' : language
         }

-    #remove_tags = [
-    #    dict(attrs={'class':'inStory'})
-    #    ,dict(name=['meta','link','iframe','base','embed','object','table','th','tr','td'])
-    #    ,dict(attrs={'id':['inset','videoDisplay']})
-    #    ]
-    #keep_only_tags = [dict(name='div', attrs={'id':['story-body','storyBody']})]
-    remove_attributes = ['lang']
-    match_regexps = [r'http://www.businessweek.com/.*_page_[1-9].*']
-
     feeds = [
-        (u'Top Stories', u'http://www.businessweek.com/topStories/rss/topStories.rss'),
-        (u'Top News', u'http://www.businessweek.com/rss/bwdaily.rss'),
-        (u'Asia', u'http://www.businessweek.com/rss/asia.rss'),
-        (u'Autos', u'http://www.businessweek.com/rss/autos/index.rss'),
-        (u'Classic Cars', u'http://rss.businessweek.com/bw_rss/classiccars'),
-        (u'Hybrids', u'http://rss.businessweek.com/bw_rss/hybrids'),
-        (u'Europe', u'http://www.businessweek.com/rss/europe.rss'),
-        (u'Auto Reviews', u'http://rss.businessweek.com/bw_rss/autoreviews'),
-        (u'Innovation & Design', u'http://www.businessweek.com/rss/innovate.rss'),
-        (u'Architecture', u'http://www.businessweek.com/rss/architecture.rss'),
-        (u'Brand Equity', u'http://www.businessweek.com/rss/brandequity.rss'),
-        (u'Auto Design', u'http://www.businessweek.com/rss/carbuff.rss'),
-        (u'Game Room', u'http://rss.businessweek.com/bw_rss/gameroom'),
-        (u'Technology', u'http://www.businessweek.com/rss/technology.rss'),
-        (u'Investing', u'http://rss.businessweek.com/bw_rss/investor'),
-        (u'Small Business', u'http://www.businessweek.com/rss/smallbiz.rss'),
-        (u'Careers', u'http://rss.businessweek.com/bw_rss/careers'),
-        (u'B-Schools', u'http://www.businessweek.com/rss/bschools.rss'),
-        (u'Magazine Selections', u'http://www.businessweek.com/rss/magazine.rss'),
-        (u'CEO Guide to Tech', u'http://www.businessweek.com/rss/ceo_guide_tech.rss'),
+        (u'Top Stories', u'http://www.businessweek.com/feeds/most-popular.rss'),
     ]

-    def get_article_url(self, article):
-        url = article.get('guid', None)
-        if 'podcasts' in url:
-            return None
-        if 'surveys' in url:
-            return None
-        if 'images' in url:
-            return None
-        if 'feedroom' in url:
-            return None
-        if '/magazine/toc/' in url:
-            return None
-        rurl, sep, rest = url.rpartition('?')
-        if rurl:
-            return rurl
-        return rest
-
     def print_version(self, url):
-        if '/news/' in url or '/blog/' in url:
-            return url
-        rurl = url.replace('http://www.businessweek.com/','http://www.businessweek.com/print/')
-        return rurl.replace('/investing/','/investor/')
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        for alink in soup.findAll('a'):
-            if alink.string is not None:
-                tstr = alink.string
-                alink.replaceWith(tstr)
-        return soup
+        soup = self.index_to_soup(url)
+        prntver = soup.find('li', attrs={'class':'print tracked'})
+        rurl = prntver.find('a', href=True)['href']
+        return rurl

@@ -1,33 +1,23 @@
-# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+# vim:fileencoding=UTF-8
+from __future__ import unicode_literals
 from calibre.web.feeds.news import BasicNewsRecipe

 class AListApart (BasicNewsRecipe):
-    __author__ = u'Marc Busqué <marc@lamarciana.com>'
+    __author__ = 'Marc Busqué <marc@lamarciana.com>'
     __url__ = 'http://www.lamarciana.com'
-    __version__ = '1.0'
+    __version__ = '2.0'
     __license__ = 'GPL v3'
-    __copyright__ = u'2012, Marc Busqué <marc@lamarciana.com>'
+    __copyright__ = '2012, Marc Busqué <marc@lamarciana.com>'
     title = u'A List Apart'
-    description = u'A List Apart Magazine (ISSN: 1534-0295) explores the design, development, and meaning of web content, with a special focus on web standards and best practices.'
+    description = u'A List Apart Magazine (ISSN: 1534-0295) explores the design, development, and meaning of web content, with a special focus on web standards and best practices. This recipe retrieve articles and columns.'
     language = 'en'
     tags = 'web development, software'
     oldest_article = 120
     remove_empty_feeds = True
-    no_stylesheets = True
     encoding = 'utf8'
     cover_url = u'http://alistapart.com/pix/alalogo.gif'
-    keep_only_tags = [
-        dict(name='div', attrs={'id': 'content'})
-    ]
-    remove_tags = [
-        dict(name='ul', attrs={'id': 'metastuff'}),
-        dict(name='div', attrs={'class': 'discuss'}),
-        dict(name='div', attrs={'class': 'discuss'}),
-        dict(name='div', attrs={'id': 'learnmore'}),
-    ]
-    remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height']
-    extra_css = u'img {max-width: 100%; display: block; margin: auto;} #authorbio img {float: left; margin-right: 2%;}'
+    extra_css = u'img {max-width: 100%; display: block; margin: auto;}'

     feeds = [
-        (u'A List Apart', u'http://www.alistapart.com/site/rss'),
+        (u'A List Apart', u'http://feeds.feedburner.com/alistapart/abridged'),
     ]

@@ -1,30 +1,30 @@
+# vim:fileencoding=UTF-8
+from __future__ import unicode_literals
 __license__ = 'GPL v3'
-__copyright__ = '2011, Eddie Lau'
+__copyright__ = '2011-2013, Eddie Lau'

 # data source: normal, mobile
 __Source__ = 'mobile'
 # please replace the following "True" with "False". (Default: True)
 __MakePeriodical__ = True
 # Turn below to True if your device supports display of CJK titles (Default: False)
-__UseChineseTitle__ = False
+__UseChineseTitle__ = True
 # Set it to False if you want to skip images (Default: True)
 __KeepImages__ = True
 # Set it to True if you want to include a summary in Kindle's article view (Default: False)
-__IncludeSummary__ = False
+__IncludeSummary__ = True
 # Set it to True if you want thumbnail images in Kindle's article view (Default: True)
 __IncludeThumbnails__ = True

 '''
 Change Log:
+2013/03/31 -- fix cover retrieval code and heading size, and remove &nbsp; in summary
 2011/12/29 -- first version done
+TODO:
+* use alternative source at http://m.singtao.com/index.php
 '''

 from calibre.utils.date import now as nowf
 import os, datetime, re
-from datetime import date
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested
 from calibre.ebooks.BeautifulSoup import BeautifulSoup

@@ -41,7 +41,7 @@ class STHKRecipe(BasicNewsRecipe):
     title = 'Sing Tao Daily - Hong Kong'
     description = 'Hong Kong Chinese Newspaper (http://singtao.com)'
     category = 'Chinese, News, Hong Kong'
-    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:150%;}'
+    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:200%;}'
     masthead_url = 'http://upload.wikimedia.org/wikipedia/en/d/dd/Singtao-usa.png'
     if __Source__ == 'normal':
         keep_only_tags = [dict(name='td', attrs={'class':['bodyhead','bodytext']})]

@@ -96,17 +96,13 @@ class STHKRecipe(BasicNewsRecipe):
         return self.get_dtlocal().strftime("%d")

     def get_cover_url(self):
-        #cover = 'http://singtao.com/media/a/a(2660).jpg' # for 2011/12/29
-        base = 2660
-        todaydate = date(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()))
-        diff = todaydate - date(2011, 12, 29)
-        base = base + int(diff.total_seconds()/(3600*24))
-        cover = 'http://singtao.com/media/a/a(' + str(base) +').jpg'
+        soup = self.index_to_soup('http://m.singtao.com/')
+        cover = soup.find(attrs={'class':'special'}).get('src', False)
         br = BasicNewsRecipe.get_browser(self)
         try:
             br.open(cover)
         except:
-            cover = 'http://singtao.com/images/stlogo.gif'
+            cover = None
         return cover

     def parse_index(self):

@@ -293,7 +289,7 @@ class STHKRecipe(BasicNewsRecipe):
                 textFound = False
                 for p in paras:
                     if not textFound:
-                        summary_candidate = self.tag_to_string(p).strip()
+                        summary_candidate = self.tag_to_string(p).strip().replace('&nbsp;', '')
                         if len(summary_candidate) > 0:
                             summary_candidate = summary_candidate.replace(u'(\u661f\u5cf6\u65e5\u5831\u5831\u9053)', '', 1)
                             article.summary = article.text_summary = summary_candidate

@@ -489,3 +485,4 @@

@@ -47,6 +47,10 @@ binary_includes = [
     '/usr/lib/libgthread-2.0.so.0',
     '/usr/lib/libpng14.so.14',
     '/usr/lib/libexslt.so.0',
+    # Ensure that libimobiledevice is compiled against openssl, not gnutls
+    '/usr/lib/libimobiledevice.so.3',
+    '/usr/lib/libusbmuxd.so.2',
+    '/usr/lib/libplist.so.1',
     MAGICK_PREFIX+'/lib/libMagickWand.so.5',
     MAGICK_PREFIX+'/lib/libMagickCore.so.5',
     '/usr/lib/libgcrypt.so.11',

@@ -399,7 +399,8 @@ class Py2App(object):
     @flush
     def add_fontconfig(self):
         info('\nAdding fontconfig')
-        for x in ('fontconfig.1', 'freetype.6', 'expat.1'):
+        for x in ('fontconfig.1', 'freetype.6', 'expat.1',
+                'plist.1', 'usbmuxd.2', 'imobiledevice.3'):
             src = os.path.join(SW, 'lib', 'lib'+x+'.dylib')
             self.install_dylib(src)
         dst = os.path.join(self.resources_dir, 'fonts')

@@ -262,6 +262,35 @@ def from_links(container):
             toc.remove(child)
     return toc

+def find_text(node):
+    LIMIT = 200
+    pat = re.compile(r'\s+')
+    for child in node:
+        if isinstance(child, etree._Element):
+            text = xml2text(child).strip()
+            text = pat.sub(' ', text)
+            if len(text) < 1:
+                continue
+            if len(text) > LIMIT:
+                # Look for less text in a child of this node, recursively
+                ntext = find_text(child)
+                return ntext or (text[:LIMIT] + '...')
+            else:
+                return text
+
+def from_files(container):
+    toc = TOC()
+    for spinepath in container.spine_items:
+        name = container.abspath_to_name(spinepath)
+        root = container.parsed(name)
+        body = XPath('//h:body')(root)
+        if not body:
+            continue
+        text = find_text(body[0])
+        if text:
+            toc.add(text, name)
+    return toc
+
 def add_id(container, name, loc):
     root = container.parsed(name)
     body = root.xpath('//*[local-name()="body"]')[0]
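
The find_text() heuristic added above (take the first non-empty text block in a file, recursing into a child when the block exceeds 200 characters) is easy to check in isolation. A self-contained sketch, not part of the commit, with plain lxml calls standing in for calibre's xml2text and namespaced XPath helpers:

    import re
    from lxml import etree

    LIMIT = 200

    def find_text(node):
        # First non-empty text block under node; recurse into a child
        # when the block is longer than LIMIT characters.
        pat = re.compile(r'\s+')
        for child in node:
            if isinstance(child, etree._Element):
                text = pat.sub(' ', ''.join(child.itertext())).strip()
                if not text:
                    continue
                if len(text) > LIMIT:
                    ntext = find_text(child)
                    return ntext or (text[:LIMIT] + '...')
                return text

    body = etree.fromstring(
        '<body><div><h1>Chapter One</h1><p>' + 'x' * 300 + '</p></div></body>')
    print(find_text(body))  # the div's text is too long, so we recurse -> 'Chapter One'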

@@ -333,8 +333,8 @@ class OEBReader(object):
         guide = self.oeb.guide
         manifest = self.oeb.manifest
         for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):
-            href = elem.get('href')
-            path = urlnormalize(urldefrag(href)[0])
+            ref_href = elem.get('href')
+            path = urlnormalize(urldefrag(ref_href)[0])
             if path not in manifest.hrefs:
                 corrected_href = None
                 for href in manifest.hrefs:

@@ -342,12 +342,12 @@ class OEBReader(object):
                         corrected_href = href
                         break
                 if corrected_href is None:
-                    self.logger.warn(u'Guide reference %r not found' % href)
+                    self.logger.warn(u'Guide reference %r not found' % ref_href)
                     continue
-                href = corrected_href
+                ref_href = corrected_href
             typ = elem.get('type')
             if typ not in guide:
-                guide.add(typ, elem.get('title'), href)
+                guide.add(typ, elem.get('title'), ref_href)

     def _find_ncx(self, opf):
         result = xpath(opf, '/o2:package/o2:spine/@toc')

@@ -18,7 +18,7 @@ from PyQt4.Qt import (QPushButton, QFrame, QVariant, QMenu, QInputDialog,
 from calibre.ebooks.oeb.polish.container import get_container, AZW3Container
 from calibre.ebooks.oeb.polish.toc import (
-    get_toc, add_id, TOC, commit_toc, from_xpaths, from_links)
+    get_toc, add_id, TOC, commit_toc, from_xpaths, from_links, from_files)
 from calibre.gui2 import Application, error_dialog, gprefs
 from calibre.gui2.progress_indicator import ProgressIndicator
 from calibre.gui2.toc.location import ItemEdit

@@ -126,6 +126,7 @@ class ItemView(QFrame): # {{{
     go_to_root = pyqtSignal()
     create_from_xpath = pyqtSignal(object)
     create_from_links = pyqtSignal()
+    create_from_files = pyqtSignal()
     flatten_toc = pyqtSignal()

     def __init__(self, parent):

@@ -183,6 +184,15 @@ class ItemView(QFrame): # {{{
             )))
         l.addWidget(b)

+        self.cfb = b = QPushButton(_('Generate ToC from &files'))
+        b.clicked.connect(self.create_from_files)
+        b.setToolTip(textwrap.fill(_(
+            'Generate a Table of Contents from individual files in the book.'
+            ' Each entry in the ToC will point to the start of the file, the'
+            ' text of the entry will be the "first line" of text from the file.'
+        )))
+        l.addWidget(b)
+
         self.xpb = b = QPushButton(_('Generate ToC from &XPath'))
         b.clicked.connect(self.create_from_user_xpath)
         b.setToolTip(textwrap.fill(_(

@@ -577,6 +587,7 @@ class TOCView(QWidget): # {{{
         i.add_new_item.connect(self.add_new_item)
         i.create_from_xpath.connect(self.create_from_xpath)
         i.create_from_links.connect(self.create_from_links)
+        i.create_from_files.connect(self.create_from_files)
         i.flatten_item.connect(self.flatten_item)
         i.flatten_toc.connect(self.flatten_toc)
         i.go_to_root.connect(self.go_to_root)

@@ -778,6 +789,14 @@ class TOCView(QWidget): # {{{
                 _('No links were found that could be added to the Table of Contents.'), show=True)
         self.insert_toc_fragment(toc)

+    def create_from_files(self):
+        toc = from_files(self.ebook)
+        if len(toc) == 0:
+            return error_dialog(self, _('No items found'),
+                _('No files were found that could be added to the Table of Contents.'), show=True)
+        self.insert_toc_fragment(toc)
+
     # }}}

 class TOCEditor(QDialog): # {{{

@@ -22507,7 +22507,7 @@ msgstr "Autoren beginnend mit '%s'"
 #: /home/kovid/work/calibre/src/calibre/library/catalogs/epub_mobi_builder.py:3477
 #, python-format
 msgid "Authors beginning with '%s'"
-msgstr "Autoren beginnen mit mit %s"
+msgstr "Autoren beginnen mit %s"

 #: /home/kovid/work/calibre/src/calibre/library/catalogs/epub_mobi_builder.py:3518
 msgid "NCX for Recently Added"