Merge from trunk
Commit 22a1422ec6

recipes/am730.recipe (new file, 290 lines)
@ -0,0 +1,290 @@
# vim:fileencoding=UTF-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Eddie Lau'
__Date__ = ''
__HiResImg__ = True

'''
Change Log:
2013/03/30 -- first version
'''

from calibre import (__appname__, force_unicode, strftime)
from calibre.utils.date import now as nowf
import os, datetime, re
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang


class AM730(BasicNewsRecipe):
    title = u'AM730'
    __author__ = 'Eddie Lau'
    publisher = 'AM730'
    oldest_article = 1
    max_articles_per_feed = 100
    auto_cleanup = False
    language = 'zh'
    encoding = 'utf-8'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    description = 'http://www.am730.com.hk'
    category = 'Chinese, News, Hong Kong'
    masthead_url = 'http://www.am730.com.hk/images/logo.jpg'

    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} photocaption {font-size:50%; margin-left:auto; margin-right:auto;}'
    keep_only_tags = [dict(name='div', attrs={'id':'articleHeader'}),
                      dict(name='div', attrs={'class':'thecontent wordsnap'}),
                      dict(name='a', attrs={'class':'lightboximg'})]
    remove_tags = [dict(name='img', attrs={'src':'/images/am730_article_logo.jpg'}),
                   dict(name='img', attrs={'src':'/images/am_endmark.gif'})]

    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        # convert UTC to local HK time - at HKT 6am, all news are available
        return dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24)

    def get_fetchdate(self):
        if __Date__ != '':
            return __Date__
        else:
            return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
        if __Date__ != '':
            return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchyear(self):
        if __Date__ != '':
            return __Date__[0:4]
        else:
            return self.get_dtlocal().strftime("%Y")

    def get_fetchmonth(self):
        if __Date__ != '':
            return __Date__[4:6]
        else:
            return self.get_dtlocal().strftime("%m")

    def get_fetchday(self):
        if __Date__ != '':
            return __Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%d")

    # Note: does not work with a custom date given by __Date__
    def get_weekday(self):
        return self.get_dtlocal().weekday()

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article, picdiv['src'])

    def parse_index(self):
        feeds = []
        soup = self.index_to_soup('http://www.am730.com.hk/')
        ul = soup.find(attrs={'class':'nav-section'})
        sectionList = []
        for li in ul.findAll('li'):
            a = 'http://www.am730.com.hk/' + li.find('a', href=True).get('href', False)
            title = li.find('a').get('title', False).strip()
            sectionList.append((title, a))
        for title, url in sectionList:
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def parse_section(self, url):
        soup = self.index_to_soup(url)
        items = soup.findAll(attrs={'style':'padding-bottom: 15px;'})
        current_articles = []
        for item in items:
            a = item.find(attrs={'class':'t6 f14'}).find('a', href=True)
            articlelink = 'http://www.am730.com.hk/' + a.get('href', True)
            title = self.tag_to_string(a)
            description = self.tag_to_string(item.find(attrs={'class':'t3 f14'}))
            current_articles.append({'title': title, 'url': articlelink, 'description': description})
        return current_articles

    def preprocess_html(self, soup):
        multia = soup.findAll('a')
        for a in multia:
            if a is not None:
                image = a.find('img')
                if image is not None:
                    if __HiResImg__:
                        image['src'] = image.get('src').replace('/thumbs/', '/')
                    caption = image.get('alt')
                    tag = Tag(soup, "photo", [])
                    tag2 = Tag(soup, "photocaption", [])
                    tag.insert(0, image)
                    if caption is not None:
                        tag2.insert(0, caption)
                    tag.insert(1, tag2)
                    a.replaceWith(tag)
        return soup

    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        title = self.short_title()
        if self.output_profile.periodical_date_in_title:
            title += strftime(self.timefmt)
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        if self.publication_type:
            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
        mi.timestamp = nowf()
        article_titles, aseen = [], set()
        for f in feeds:
            for a in f:
                if a.title and a.title not in aseen:
                    aseen.add(a.title)
                    article_titles.append(force_unicode(a.title, 'utf-8'))

        mi.comments = self.description
        if not isinstance(mi.comments, unicode):
            mi.comments = mi.comments.decode('utf-8', 'replace')
        mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
                        '\n\n'.join(article_titles))

        language = canonicalize_lang(self.language)
        if language is not None:
            mi.language = language
        # This one affects the pub date shown in kindle title
        #mi.pubdate = nowf()
        # now appears to need the time field to be > 12.00noon as well
        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')

        opf = OPFCreator(dir, mi)
        # Add mastheadImage entry to <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)

        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))

        # Get cover
        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
            if self.default_cover(pf):
                cpath = pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)

        # Get masthead
        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)

        opf.create_manifest_from_files_in(manifest)
        for mani in opf.manifest:
            if mani.path.endswith('.ncx'):
                mani.id = 'ncx'
            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'

        entries = ['index.html']
        toc = TOC(base_path=dir)
        self.play_order_counter = 0
        self.play_order_map = {}

        def feed_index(num, parent):
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/'%(num, j)
                    auth = a.author
                    if not auth:
                        auth = None
                    desc = a.text_summary
                    if not desc:
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                    entries.append('%sindex.html'%adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html'%adir, None,
                            a.title if a.title else _('Untitled Article'),
                            play_order=po, author=auth,
                            description=desc, toc_thumbnail=tt)
                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp

                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                            not self.has_single_feed,
                                            a.orig_url, __appname__, prefix=prefix,
                                            center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(unicode(soup).encode('utf-8'))

        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')

        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html'%i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                    f.title, play_order=po, description=desc, author=auth))
        else:
            entries.append('feed_%d/index.html'%0)
            feed_index(0, toc)

        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)

        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)
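The timezone trick in get_dtlocal above deserves a word: instead of a timezone library, the recipe shifts UTC forward by Hong Kong's +8 hours and then back by 6, so the computed date only rolls over at 6am HKT, once the day's edition is complete. A minimal standalone sketch of the same idea (the function name hk_edition_date is ours, not calibre's):

import datetime

def hk_edition_date(utc_now=None):
    # +8h converts UTC to HKT; -6h keeps yesterday's date until 6am HKT,
    # the point at which all of today's news is available.
    if utc_now is None:
        utc_now = datetime.datetime.utcnow()
    return (utc_now + datetime.timedelta(hours=8)
                    - datetime.timedelta(hours=6)).strftime('%Y%m%d')

print(hk_edition_date(datetime.datetime(2013, 3, 30, 21, 0)))  # 05:00 HKT on the 31st -> '20130330'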
@ -1,161 +1,275 @@
-# -*- coding: utf-8 -*-
-import re
+# vim:fileencoding=UTF-8
+from __future__ import unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Eddie Lau'
+__Date__ = ''
+
+from calibre import (__appname__, force_unicode, strftime)
+from calibre.utils.date import now as nowf
+import os, datetime, re
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from contextlib import nested
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.metadata.toc import TOC
+from calibre.ebooks.metadata import MetaInformation
+from calibre.utils.localization import canonicalize_lang

 class AppleDaily(BasicNewsRecipe):
-    title = u'蘋果日報'
-    __author__ = u'蘋果日報'
-    __publisher__ = u'蘋果日報'
-    description = u'蘋果日報'
-    masthead_url = 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
-    language = 'zh_TW'
-    encoding = 'UTF-8'
-    timefmt = ' [%a, %d %b, %Y]'
-    needs_subscription = False
+    title = u'蘋果日報 (香港)'
+    __author__ = 'Eddie Lau'
+    publisher = '蘋果日報'
+    oldest_article = 1
+    max_articles_per_feed = 100
+    auto_cleanup = False
+    language = 'zh'
+    encoding = 'utf-8'
     remove_javascript = True
-    remove_tags_before = dict(name=['ul', 'h1'])
-    remove_tags_after = dict(name='form')
-    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
-                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
-                   dict(name=['script', 'noscript', 'style', 'form'])]
+    use_embedded_content = False
     no_stylesheets = True
-    extra_css = '''
-        @font-face {font-family: "uming", serif, sans-serif; src: url(res:///usr/share/fonts/truetype/arphic/uming.ttc); }\n
-        body {margin-right: 8pt; font-family: 'uming', serif;}
-        h1 {font-family: 'uming', serif, sans-serif}
-        '''
-    #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
-
-    preprocess_regexps = [
-        (re.compile(r'img.php?server=(?P<server>[^&]+)&path=(?P<path>[^&]+).*', re.DOTALL|re.IGNORECASE),
-        lambda match: 'http://' + match.group('server') + '/' + match.group('path')),
-    ]
+    description = 'http://hkm.appledaily.com/'
+    category = 'Chinese, News, Hong Kong'
+    masthead_url = 'http://upload.wikimedia.org/wikipedia/zh/c/cf/AppleDailyLogo1.png'
+    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} h1 {font-size:200%; text-align:left; font-weight:bold;} p[class=video-caption] {font-size:50%; margin-left:auto; margin-right:auto;}'
+    keep_only_tags = [dict(name='div', attrs={'id':'content-article'})]
+    remove_tags = [dict(name='div', attrs={'class':'prev-next-btn'}),
+                   dict(name='p', attrs={'class':'next'})]
+
+    def get_dtlocal(self):
+        dt_utc = datetime.datetime.utcnow()
+        # convert UTC to local hk time - at HKT 6am, all news are available
+        return dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24)
+
+    def get_fetchdate(self):
+        if __Date__ != '':
+            return __Date__
+        else:
+            return self.get_dtlocal().strftime("%Y%m%d")
+
+    def get_fetchformatteddate(self):
+        if __Date__ != '':
+            return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
+        else:
+            return self.get_dtlocal().strftime("%Y-%m-%d")
+
+    def get_fetchyear(self):
+        if __Date__ != '':
+            return __Date__[0:4]
+        else:
+            return self.get_dtlocal().strftime("%Y")
+
+    def get_fetchmonth(self):
+        if __Date__ != '':
+            return __Date__[4:6]
+        else:
+            return self.get_dtlocal().strftime("%m")
+
+    def get_fetchday(self):
+        if __Date__ != '':
+            return __Date__[6:8]
+        else:
+            return self.get_dtlocal().strftime("%d")
+
+    # Note: does not work with a custom date given by __Date__
+    def get_weekday(self):
+        return self.get_dtlocal().weekday()

     def get_cover_url(self):
-        return 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
-
-    #def get_browser(self):
-        #br = BasicNewsRecipe.get_browser(self)
-        #if self.username is not None and self.password is not None:
-        #    br.open('http://www.nytimes.com/auth/login')
-        #    br.select_form(name='login')
-        #    br['USERID'] = self.username
-        #    br['PASSWORD'] = self.password
-        #    br.submit()
-        #return br
-
-    def preprocess_html(self, soup):
-        #process all the images
-        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
-            iurl = tag['src']
-            #print 'checking image: ' + iurl
-            #img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
-            p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
-            m = p.search(iurl)
-            if m is not None:
-                iurl = 'http://' + m.group('server') + '/' + m.group('path')
-                #print 'working! new url: ' + iurl
-                tag['src'] = iurl
-            #else:
-                #print 'not good'
-
-        for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
-            iurl = tag['href']
-            #print 'checking image: ' + iurl
-            #img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
-            p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
-            m = p.search(iurl)
-            if m is not None:
-                iurl = 'http://' + m.group('server') + '/' + m.group('path')
-                #print 'working! new url: ' + iurl
-                tag['href'] = iurl
-            #else:
-                #print 'not good'
-
-        return soup
+        soup = self.index_to_soup('http://hkm.appledaily.com/')
+        cover = soup.find(attrs={'class':'top-news'}).get('src', False)
+        br = BasicNewsRecipe.get_browser(self)
+        try:
+            br.open(cover)
+        except:
+            cover = None
+        return cover
+
+    def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            picdiv = soup.find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article, picdiv['src'])

     def parse_index(self):
-        base = 'http://news.hotpot.hk/fruit'
-        soup = self.index_to_soup('http://news.hotpot.hk/fruit/index.php')
-
-        #def feed_title(div):
-        #    return ''.join(div.findAll(text=True, recursive=False)).strip()
-
-        articles = {}
-        key = None
-        ans = []
-        for div in soup.findAll('li'):
-            key = div.find(text=True, recursive=True);
-            #if key == u'豪情':
-            #    continue;
-            print 'section=' + key
-            articles[key] = []
-            ans.append(key)
-            a = div.find('a', href=True)
-            if not a:
-                continue
-            url = base + '/' + a['href']
-            print 'url=' + url
-            if not articles.has_key(key):
-                articles[key] = []
-            else:
-                # sub page
-                subSoup = self.index_to_soup(url)
-                for subDiv in subSoup.findAll('li'):
-                    subA = subDiv.find('a', href=True)
-                    subTitle = subDiv.find(text=True, recursive=True)
-                    subUrl = base + '/' + subA['href']
-                    print 'subUrl' + subUrl
-                    articles[key].append(
-                        dict(title=subTitle,
-                             url=subUrl,
-                             date='',
-                             description='',
-                             content=''))
-
-        # elif div['class'] in ['story', 'story headline']:
-        #     a = div.find('a', href=True)
-        #     if not a:
-        #         continue
-        #     url = re.sub(r'\?.*', '', a['href'])
-        #     url += '?pagewanted=all'
-        #     title = self.tag_to_string(a, use_alt=True).strip()
-        #     description = ''
-        #     pubdate = strftime('%a, %d %b')
-        #     summary = div.find(True, attrs={'class':'summary'})
-        #     if summary:
-        #         description = self.tag_to_string(summary, use_alt=False)
-        #
-        #     feed = key if key is not None else 'Uncategorized'
-        #     if not articles.has_key(feed):
-        #         articles[feed] = []
-        #     if not 'podcasts' in url:
-        #         articles[feed].append(
-        #             dict(title=title, url=url, date=pubdate,
-        #                  description=description,
-        #                  content=''))
-        # ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
-        ans = [(unicode(key), articles[key]) for key in ans if articles.has_key(key)]
-        return ans
+        feeds = []
+        soup = self.index_to_soup('http://hkm.appledaily.com/')
+        ul = soup.find(attrs={'class':'menu'})
+        sectionList = []
+        for li in ul.findAll('li'):
+            a = 'http://hkm.appledaily.com/' + li.find('a', href=True).get('href', False)
+            title = li.find('a', text=True).strip()
+            if not title == u'動新聞':
+                sectionList.append((title, a))
+        for title, url in sectionList:
+            articles = self.parse_section(url)
+            if articles:
+                feeds.append((title, articles))
+        return feeds
+
+    def parse_section(self, url):
+        soup = self.index_to_soup(url)
+        ul = soup.find(attrs={'class':'list'})
+        current_articles = []
+        for li in ul.findAll('li'):
+            a = li.find('a', href=True)
+            title = li.find('p', text=True).strip()
+            if a is not None:
+                current_articles.append({'title': title, 'url': 'http://hkm.appledaily.com/' + a.get('href', False)})
+        return current_articles
+
+    def create_opf(self, feeds, dir=None):
+        if dir is None:
+            dir = self.output_dir
+        title = self.short_title()
+        if self.output_profile.periodical_date_in_title:
+            title += strftime(self.timefmt)
+        mi = MetaInformation(title, [__appname__])
+        mi.publisher = __appname__
+        mi.author_sort = __appname__
+        if self.publication_type:
+            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+        mi.timestamp = nowf()
+        article_titles, aseen = [], set()
+        for f in feeds:
+            for a in f:
+                if a.title and a.title not in aseen:
+                    aseen.add(a.title)
+                    article_titles.append(force_unicode(a.title, 'utf-8'))
+
+        mi.comments = self.description
+        if not isinstance(mi.comments, unicode):
+            mi.comments = mi.comments.decode('utf-8', 'replace')
+        mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
+                        '\n\n'.join(article_titles))
+
+        language = canonicalize_lang(self.language)
+        if language is not None:
+            mi.language = language
+        # This one affects the pub date shown in kindle title
+        #mi.pubdate = nowf()
+        # now appears to need the time field to be > 12.00noon as well
+        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
+        opf_path = os.path.join(dir, 'index.opf')
+        ncx_path = os.path.join(dir, 'index.ncx')
+
+        opf = OPFCreator(dir, mi)
+        # Add mastheadImage entry to <guide> section
+        mp = getattr(self, 'masthead_path', None)
+        if mp is not None and os.access(mp, os.R_OK):
+            from calibre.ebooks.metadata.opf2 import Guide
+            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
+            ref.type = 'masthead'
+            ref.title = 'Masthead Image'
+            opf.guide.append(ref)
+
+        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+        manifest.append(os.path.join(dir, 'index.html'))
+        manifest.append(os.path.join(dir, 'index.ncx'))
+
+        # Get cover
+        cpath = getattr(self, 'cover_path', None)
+        if cpath is None:
+            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
+            if self.default_cover(pf):
+                cpath = pf.name
+        if cpath is not None and os.access(cpath, os.R_OK):
+            opf.cover = cpath
+            manifest.append(cpath)
+
+        # Get masthead
+        mpath = getattr(self, 'masthead_path', None)
+        if mpath is not None and os.access(mpath, os.R_OK):
+            manifest.append(mpath)
+
+        opf.create_manifest_from_files_in(manifest)
+        for mani in opf.manifest:
+            if mani.path.endswith('.ncx'):
+                mani.id = 'ncx'
+            if mani.path.endswith('mastheadImage.jpg'):
+                mani.id = 'masthead-image'
+
+        entries = ['index.html']
+        toc = TOC(base_path=dir)
+        self.play_order_counter = 0
+        self.play_order_map = {}
+
+        def feed_index(num, parent):
+            f = feeds[num]
+            for j, a in enumerate(f):
+                if getattr(a, 'downloaded', False):
+                    adir = 'feed_%d/article_%d/'%(num, j)
+                    auth = a.author
+                    if not auth:
+                        auth = None
+                    desc = a.text_summary
+                    if not desc:
+                        desc = None
+                    else:
+                        desc = self.description_limiter(desc)
+                    tt = a.toc_thumbnail if a.toc_thumbnail else None
+                    entries.append('%sindex.html'%adir)
+                    po = self.play_order_map.get(entries[-1], None)
+                    if po is None:
+                        self.play_order_counter += 1
+                        po = self.play_order_counter
+                    parent.add_item('%sindex.html'%adir, None,
+                            a.title if a.title else _('Untitled Article'),
+                            play_order=po, author=auth,
+                            description=desc, toc_thumbnail=tt)
+                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
+                    for sp in a.sub_pages:
+                        prefix = os.path.commonprefix([opf_path, sp])
+                        relp = sp[len(prefix):]
+                        entries.append(relp.replace(os.sep, '/'))
+                        last = sp
+
+                    if os.path.exists(last):
+                        with open(last, 'rb') as fi:
+                            src = fi.read().decode('utf-8')
+                        soup = BeautifulSoup(src)
+                        body = soup.find('body')
+                        if body is not None:
+                            prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
+                            templ = self.navbar.generate(True, num, j, len(f),
+                                            not self.has_single_feed,
+                                            a.orig_url, __appname__, prefix=prefix,
+                                            center=self.center_navbar)
+                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+                            body.insert(len(body.contents), elem)
+                            with open(last, 'wb') as fi:
+                                fi.write(unicode(soup).encode('utf-8'))
+
+        if len(feeds) == 0:
+            raise Exception('All feeds are empty, aborting.')
+
+        if len(feeds) > 1:
+            for i, f in enumerate(feeds):
+                entries.append('feed_%d/index.html'%i)
+                po = self.play_order_map.get(entries[-1], None)
+                if po is None:
+                    self.play_order_counter += 1
+                    po = self.play_order_counter
+                auth = getattr(f, 'author', None)
+                if not auth:
+                    auth = None
+                desc = getattr(f, 'description', None)
+                if not desc:
+                    desc = None
+                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
+                    f.title, play_order=po, description=desc, author=auth))
+        else:
+            entries.append('feed_%d/index.html'%0)
+            feed_index(0, toc)
+
+        for i, p in enumerate(entries):
+            entries[i] = os.path.join(dir, p.replace('/', os.sep))
+        opf.create_spine(entries)
+        opf.set_toc(toc)
+
+        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
+            opf.render(opf_file, ncx_file)
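Both rewritten recipes honour the same parse_index contract: return a list of (section_title, articles) pairs in which every article is a dict carrying at least a title and a url. A hedged illustration of the expected shape (the sample values are invented, not scraped):

def example_index():
    # Shape consumed by the feeds machinery: [(section title, [article dicts]), ...]
    return [
        (u'要聞', [
            {'title': u'Sample headline',
             'url': 'http://hkm.appledaily.com/detail.php?guid=0',
             'description': ''},
        ]),
    ]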
@ -37,68 +37,15 @@ class BusinessWeek(BasicNewsRecipe):
                         , 'language' : language
                         }

-    #remove_tags = [
-    #dict(attrs={'class':'inStory'})
-    #,dict(name=['meta','link','iframe','base','embed','object','table','th','tr','td'])
-    #,dict(attrs={'id':['inset','videoDisplay']})
-    #]
-    #keep_only_tags = [dict(name='div', attrs={'id':['story-body','storyBody']})]
-    remove_attributes = ['lang']
-    match_regexps = [r'http://www.businessweek.com/.*_page_[1-9].*']
-
     feeds = [
-        (u'Top Stories', u'http://www.businessweek.com/topStories/rss/topStories.rss'),
-        (u'Top News', u'http://www.businessweek.com/rss/bwdaily.rss'),
-        (u'Asia', u'http://www.businessweek.com/rss/asia.rss'),
-        (u'Autos', u'http://www.businessweek.com/rss/autos/index.rss'),
-        (u'Classic Cars', u'http://rss.businessweek.com/bw_rss/classiccars'),
-        (u'Hybrids', u'http://rss.businessweek.com/bw_rss/hybrids'),
-        (u'Europe', u'http://www.businessweek.com/rss/europe.rss'),
-        (u'Auto Reviews', u'http://rss.businessweek.com/bw_rss/autoreviews'),
-        (u'Innovation & Design', u'http://www.businessweek.com/rss/innovate.rss'),
-        (u'Architecture', u'http://www.businessweek.com/rss/architecture.rss'),
-        (u'Brand Equity', u'http://www.businessweek.com/rss/brandequity.rss'),
-        (u'Auto Design', u'http://www.businessweek.com/rss/carbuff.rss'),
-        (u'Game Room', u'http://rss.businessweek.com/bw_rss/gameroom'),
-        (u'Technology', u'http://www.businessweek.com/rss/technology.rss'),
-        (u'Investing', u'http://rss.businessweek.com/bw_rss/investor'),
-        (u'Small Business', u'http://www.businessweek.com/rss/smallbiz.rss'),
-        (u'Careers', u'http://rss.businessweek.com/bw_rss/careers'),
-        (u'B-Schools', u'http://www.businessweek.com/rss/bschools.rss'),
-        (u'Magazine Selections', u'http://www.businessweek.com/rss/magazine.rss'),
-        (u'CEO Guide to Tech', u'http://www.businessweek.com/rss/ceo_guide_tech.rss'),
+        (u'Top Stories', u'http://www.businessweek.com/feeds/most-popular.rss'),
     ]

-    def get_article_url(self, article):
-        url = article.get('guid', None)
-        if 'podcasts' in url:
-            return None
-        if 'surveys' in url:
-            return None
-        if 'images' in url:
-            return None
-        if 'feedroom' in url:
-            return None
-        if '/magazine/toc/' in url:
-            return None
-        rurl, sep, rest = url.rpartition('?')
-        if rurl:
-            return rurl
-        return rest
-
     def print_version(self, url):
-        if '/news/' in url or '/blog/ in url':
-            return url
-        rurl = url.replace('http://www.businessweek.com/','http://www.businessweek.com/print/')
-        return rurl.replace('/investing/','/investor/')
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        for alink in soup.findAll('a'):
-            if alink.string is not None:
-                tstr = alink.string
-                alink.replaceWith(tstr)
-        return soup
+        soup = self.index_to_soup(url)
+        prntver = soup.find('li', attrs={'class':'print tracked'})
+        rurl = prntver.find('a', href=True)['href']
+        return rurl
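The BusinessWeek change replaces hand-built print URLs with whatever the article page's own print button links to, which survives site restructuring. A self-contained sketch of that lookup over an already-fetched page (standalone bs4 stands in for calibre's index_to_soup; the function name is ours):

from bs4 import BeautifulSoup

def find_print_url(article_html):
    # The print version is the href of the <a> inside <li class="print tracked">.
    soup = BeautifulSoup(article_html, 'html.parser')
    li = soup.find('li', attrs={'class': 'print tracked'})
    if li is None:
        return None
    a = li.find('a', href=True)
    return a['href'] if a else None

html = '<ul><li class="print tracked"><a href="/print/x.html">Print</a></li></ul>'
print(find_print_url(html))  # -> /print/x.html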
@ -1,33 +1,23 @@
-# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+# vim:fileencoding=UTF-8
+from __future__ import unicode_literals
 from calibre.web.feeds.news import BasicNewsRecipe

 class AListApart (BasicNewsRecipe):
-    __author__ = u'Marc Busqué <marc@lamarciana.com>'
+    __author__ = 'Marc Busqué <marc@lamarciana.com>'
     __url__ = 'http://www.lamarciana.com'
-    __version__ = '1.0'
+    __version__ = '2.0'
     __license__ = 'GPL v3'
-    __copyright__ = u'2012, Marc Busqué <marc@lamarciana.com>'
+    __copyright__ = '2012, Marc Busqué <marc@lamarciana.com>'
     title = u'A List Apart'
-    description = u'A List Apart Magazine (ISSN: 1534-0295) explores the design, development, and meaning of web content, with a special focus on web standards and best practices.'
+    description = u'A List Apart Magazine (ISSN: 1534-0295) explores the design, development, and meaning of web content, with a special focus on web standards and best practices. This recipe retrieves articles and columns.'
     language = 'en'
     tags = 'web development, software'
     oldest_article = 120
     remove_empty_feeds = True
-    no_stylesheets = True
     encoding = 'utf8'
     cover_url = u'http://alistapart.com/pix/alalogo.gif'
-    keep_only_tags = [
-        dict(name='div', attrs={'id': 'content'})
-    ]
-    remove_tags = [
-        dict(name='ul', attrs={'id': 'metastuff'}),
-        dict(name='div', attrs={'class': 'discuss'}),
-        dict(name='div', attrs={'class': 'discuss'}),
-        dict(name='div', attrs={'id': 'learnmore'}),
-    ]
-    remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height']
-    extra_css = u'img {max-width: 100%; display: block; margin: auto;} #authorbio img {float: left; margin-right: 2%;}'
+    extra_css = u'img {max-width: 100%; display: block; margin: auto;}'

     feeds = [
-        (u'A List Apart', u'http://www.alistapart.com/site/rss'),
+        (u'A List Apart', u'http://feeds.feedburner.com/alistapart/abridged'),
     ]
@ -1,30 +1,30 @@
+# vim:fileencoding=UTF-8
+from __future__ import unicode_literals
 __license__ = 'GPL v3'
-__copyright__ = '2011, Eddie Lau'
+__copyright__ = '2011-2013, Eddie Lau'

 # data source: normal, mobile
 __Source__ = 'mobile'
 # please replace the following "True" with "False". (Default: True)
 __MakePeriodical__ = True
 # Turn below to True if your device supports display of CJK titles (Default: False)
-__UseChineseTitle__ = False
+__UseChineseTitle__ = True
 # Set it to False if you want to skip images (Default: True)
 __KeepImages__ = True
 # Set it to True if you want to include a summary in Kindle's article view (Default: False)
-__IncludeSummary__ = False
+__IncludeSummary__ = True
 # Set it to True if you want thumbnail images in Kindle's article view (Default: True)
 __IncludeThumbnails__ = True

 '''
 Change Log:
+2013/03/31 -- fix cover retrieval code and heading size, and remove &nbsp; in summary
 2011/12/29 -- first version done
-TODO:
-* use alternative source at http://m.singtao.com/index.php
 '''

 from calibre.utils.date import now as nowf
 import os, datetime, re
-from datetime import date
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
@ -41,7 +41,7 @@ class STHKRecipe(BasicNewsRecipe):
     title = 'Sing Tao Daily - Hong Kong'
     description = 'Hong Kong Chinese Newspaper (http://singtao.com)'
     category = 'Chinese, News, Hong Kong'
-    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:150%;}'
+    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:200%;}'
     masthead_url = 'http://upload.wikimedia.org/wikipedia/en/d/dd/Singtao-usa.png'
     if __Source__ == 'normal':
         keep_only_tags = [dict(name='td', attrs={'class':['bodyhead','bodytext']})]
@ -96,17 +96,13 @@ class STHKRecipe(BasicNewsRecipe):
         return self.get_dtlocal().strftime("%d")

     def get_cover_url(self):
-        #cover = 'http://singtao.com/media/a/a(2660).jpg' # for 2011/12/29
-        base = 2660
-        todaydate = date(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()))
-        diff = todaydate - date(2011, 12, 29)
-        base = base + int(diff.total_seconds()/(3600*24))
-        cover = 'http://singtao.com/media/a/a(' + str(base) +').jpg'
+        soup = self.index_to_soup('http://m.singtao.com/')
+        cover = soup.find(attrs={'class':'special'}).get('src', False)
         br = BasicNewsRecipe.get_browser(self)
         try:
             br.open(cover)
         except:
-            cover = 'http://singtao.com/images/stlogo.gif'
+            cover = None
         return cover

     def parse_index(self):
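The deleted cover logic is worth recording because of its magic constant: cover images were numbered serially, a(2660).jpg was the cover for 2011-12-29, and the number advanced by one per day, so the recipe derived today's number from the calendar. A sketch of that retired arithmetic (the function name is ours):

from datetime import date

def guess_cover_url(today):
    # a(2660).jpg was the cover on 2011-12-29; one new serial per day.
    serial = 2660 + (today - date(2011, 12, 29)).days
    return 'http://singtao.com/media/a/a(%d).jpg' % serial

print(guess_cover_url(date(2011, 12, 30)))  # -> .../a(2661).jpg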
@ -289,11 +285,11 @@ class STHKRecipe(BasicNewsRecipe):
                 # the text may or may not be enclosed in <p></p> tag
                 paras = articlebody.findAll('p')
                 if not paras:
                     paras = articlebody
                 textFound = False
                 for p in paras:
                     if not textFound:
-                        summary_candidate = self.tag_to_string(p).strip()
+                        summary_candidate = self.tag_to_string(p).strip().replace('&nbsp;', '')
                         if len(summary_candidate) > 0:
                             summary_candidate = summary_candidate.replace(u'(\u661f\u5cf6\u65e5\u5831\u5831\u9053)', '', 1)
                             article.summary = article.text_summary = summary_candidate
@ -489,3 +485,4 @@ class STHKRecipe(BasicNewsRecipe):



+
@ -47,6 +47,10 @@ binary_includes = [
     '/usr/lib/libgthread-2.0.so.0',
     '/usr/lib/libpng14.so.14',
     '/usr/lib/libexslt.so.0',
+    # Ensure that libimobiledevice is compiled against openssl, not gnutls
+    '/usr/lib/libimobiledevice.so.3',
+    '/usr/lib/libusbmuxd.so.2',
+    '/usr/lib/libplist.so.1',
     MAGICK_PREFIX+'/lib/libMagickWand.so.5',
     MAGICK_PREFIX+'/lib/libMagickCore.so.5',
     '/usr/lib/libgcrypt.so.11',
@ -399,7 +399,8 @@ class Py2App(object):
     @flush
     def add_fontconfig(self):
         info('\nAdding fontconfig')
-        for x in ('fontconfig.1', 'freetype.6', 'expat.1'):
+        for x in ('fontconfig.1', 'freetype.6', 'expat.1',
+                  'plist.1', 'usbmuxd.2', 'imobiledevice.3'):
             src = os.path.join(SW, 'lib', 'lib'+x+'.dylib')
             self.install_dylib(src)
         dst = os.path.join(self.resources_dir, 'fonts')
@ -262,6 +262,35 @@ def from_links(container):
             toc.remove(child)
     return toc

+def find_text(node):
+    LIMIT = 200
+    pat = re.compile(r'\s+')
+    for child in node:
+        if isinstance(child, etree._Element):
+            text = xml2text(child).strip()
+            text = pat.sub(' ', text)
+            if len(text) < 1:
+                continue
+            if len(text) > LIMIT:
+                # Look for less text in a child of this node, recursively
+                ntext = find_text(child)
+                return ntext or (text[:LIMIT] + '...')
+            else:
+                return text
+
+def from_files(container):
+    toc = TOC()
+    for spinepath in container.spine_items:
+        name = container.abspath_to_name(spinepath)
+        root = container.parsed(name)
+        body = XPath('//h:body')(root)
+        if not body:
+            continue
+        text = find_text(body[0])
+        if text:
+            toc.add(text, name)
+    return toc
+
 def add_id(container, name, loc):
     root = container.parsed(name)
     body = root.xpath('//*[local-name()="body"]')[0]
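from_files titles each spine entry with the file's "first line" of text; find_text recurses whenever a node's text exceeds LIMIT characters, so a short heading is preferred over a whole chapter body. The same recursion over plain lxml, as a self-contained sketch (itertext stands in for calibre's xml2text; the function name first_line is ours):

import re
from lxml import etree

LIMIT = 200
pat = re.compile(r'\s+')

def first_line(node):
    for child in node:
        if isinstance(child, etree._Element):
            text = pat.sub(' ', ''.join(child.itertext()).strip())
            if len(text) < 1:
                continue
            if len(text) > LIMIT:
                # Prefer a shorter run of text from a descendant
                return first_line(child) or (text[:LIMIT] + '...')
            else:
                return text

body = etree.fromstring(
    '<body><div><h1>Chapter One</h1><p>' + 'x' * 300 + '</p></div></body>')
print(first_line(body))  # -> 'Chapter One'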
@ -333,8 +333,8 @@ class OEBReader(object):
         guide = self.oeb.guide
         manifest = self.oeb.manifest
         for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):
-            href = elem.get('href')
-            path = urlnormalize(urldefrag(href)[0])
+            ref_href = elem.get('href')
+            path = urlnormalize(urldefrag(ref_href)[0])
             if path not in manifest.hrefs:
                 corrected_href = None
                 for href in manifest.hrefs:
@ -342,12 +342,12 @@ class OEBReader(object):
                         corrected_href = href
                         break
                 if corrected_href is None:
-                    self.logger.warn(u'Guide reference %r not found' % href)
+                    self.logger.warn(u'Guide reference %r not found' % ref_href)
                     continue
-                href = corrected_href
+                ref_href = corrected_href
             typ = elem.get('type')
             if typ not in guide:
-                guide.add(typ, elem.get('title'), href)
+                guide.add(typ, elem.get('title'), ref_href)

     def _find_ncx(self, opf):
         result = xpath(opf, '/o2:package/o2:spine/@toc')
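The href to ref_href rename fixes a classic shadowing bug: the inner `for href in manifest.hrefs` loop reuses the outer name, so by the time the warning fires or the guide entry is added, href no longer holds the value read from the <reference> element. A stripped-down illustration of the failure mode:

def lookup(candidates, wanted):
    href = wanted
    for href in candidates:   # silently clobbers the outer binding
        if href == wanted:
            break
    return href               # last candidate if nothing matched

print(lookup(['a.html', 'b.html'], 'missing.html'))  # -> 'b.html', not 'missing.html'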
@ -18,7 +18,7 @@ from PyQt4.Qt import (QPushButton, QFrame, QVariant, QMenu, QInputDialog,
 from calibre.ebooks.oeb.polish.container import get_container, AZW3Container
 from calibre.ebooks.oeb.polish.toc import (
-    get_toc, add_id, TOC, commit_toc, from_xpaths, from_links)
+    get_toc, add_id, TOC, commit_toc, from_xpaths, from_links, from_files)
 from calibre.gui2 import Application, error_dialog, gprefs
 from calibre.gui2.progress_indicator import ProgressIndicator
 from calibre.gui2.toc.location import ItemEdit
@ -126,6 +126,7 @@ class ItemView(QFrame): # {{{
     go_to_root = pyqtSignal()
     create_from_xpath = pyqtSignal(object)
     create_from_links = pyqtSignal()
+    create_from_files = pyqtSignal()
     flatten_toc = pyqtSignal()

     def __init__(self, parent):
@ -183,6 +184,15 @@ class ItemView(QFrame): # {{{
             )))
         l.addWidget(b)

+        self.cfb = b = QPushButton(_('Generate ToC from &files'))
+        b.clicked.connect(self.create_from_files)
+        b.setToolTip(textwrap.fill(_(
+            'Generate a Table of Contents from individual files in the book.'
+            ' Each entry in the ToC will point to the start of the file, the'
+            ' text of the entry will be the "first line" of text from the file.'
+        )))
+        l.addWidget(b)
+
         self.xpb = b = QPushButton(_('Generate ToC from &XPath'))
         b.clicked.connect(self.create_from_user_xpath)
         b.setToolTip(textwrap.fill(_(
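The GUI plumbing for the new button follows the file's existing pattern: a zero-argument signal on ItemView, a button whose clicked signal is chained straight to it, and a TOCView slot connected at setup time. A minimal sketch of that signal-to-signal chaining (PyQt4-era; the demo class is ours, stripped to the one button):

from PyQt4.Qt import QWidget, QPushButton, QVBoxLayout, pyqtSignal

class FilesButtonDemo(QWidget):
    create_from_files = pyqtSignal()

    def __init__(self, parent=None):
        QWidget.__init__(self, parent)
        l = QVBoxLayout(self)
        b = QPushButton('Generate ToC from files', self)
        # clicked(bool) chained to a zero-arg signal: Qt drops the argument
        b.clicked.connect(self.create_from_files)
        l.addWidget(b)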
@ -577,6 +587,7 @@ class TOCView(QWidget): # {{{
         i.add_new_item.connect(self.add_new_item)
         i.create_from_xpath.connect(self.create_from_xpath)
         i.create_from_links.connect(self.create_from_links)
+        i.create_from_files.connect(self.create_from_files)
         i.flatten_item.connect(self.flatten_item)
         i.flatten_toc.connect(self.flatten_toc)
         i.go_to_root.connect(self.go_to_root)
@ -778,6 +789,14 @@ class TOCView(QWidget): # {{{
                 _('No links were found that could be added to the Table of Contents.'), show=True)
         self.insert_toc_fragment(toc)

+    def create_from_files(self):
+        toc = from_files(self.ebook)
+        if len(toc) == 0:
+            return error_dialog(self, _('No items found'),
+                _('No files were found that could be added to the Table of Contents.'), show=True)
+        self.insert_toc_fragment(toc)
+
 # }}}

 class TOCEditor(QDialog): # {{{
@ -22507,7 +22507,7 @@ msgstr "Autoren beginnend mit '%s'"
 #: /home/kovid/work/calibre/src/calibre/library/catalogs/epub_mobi_builder.py:3477
 #, python-format
 msgid "Authors beginning with '%s'"
-msgstr "Autoren beginnen mit mit %s"
+msgstr "Autoren beginnen mit %s"

 #: /home/kovid/work/calibre/src/calibre/library/catalogs/epub_mobi_builder.py:3518
 msgid "NCX for Recently Added"