mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
sync with Kovid's branch
This commit is contained in:
commit
f6fee32395
@ -20,6 +20,58 @@
|
||||
# new recipes:
|
||||
# - title:
|
||||
|
||||
- version: 0.9.26
|
||||
date: 2013-04-05
|
||||
|
||||
new features:
|
||||
- title: "PDF Output: Allow using templates to create arbitrary headers and footers. Look under PDF Output in the conversion dialog for this feature."
|
||||
|
||||
- title: "ToC Editor: Allow generating the ToC directly from individual files inside the ebook. Useful for EPUBs that have individual chapters in single files."
|
||||
tickets: [1163520]
|
||||
|
||||
- title: "ToC Editor: Add buttons to indent/unindent the current entry"
|
||||
|
||||
- title: "ToC Editor: Right-click menu to perform various useful actions on entries in the ToC"
|
||||
|
||||
- title: "Column icons: Allow use of wide images as column icons"
|
||||
|
||||
- title: "Add USB ids for the Palm Pre2 and Samsung Galaxy phone to the device drivers"
|
||||
tickets: [1162293,1163115]
|
||||
|
||||
bug fixes:
|
||||
- title: "PDF Output: Fix generating page numbers causing links to not work."
|
||||
tickets: [1162573]
|
||||
|
||||
- title: "Wrong filename output in error message when 'Guide reference not found'"
|
||||
tickets: [1163659]
|
||||
|
||||
- title: "Get Books: Update Amazon, Barnes & Noble, Waterstones and Gutenberg store plugins for website change"
|
||||
|
||||
- title: "PDF Output: Fix 1 pixel wide left and top margins on the cover page for some PDF conversions due to incorrect rounding."
|
||||
tickets: [1162054]
|
||||
|
||||
- title: "ToC Editor: Fix drag and drop of multiple items resulting in the dropped items being in random order sometimes."
|
||||
tickets: [1161999]
|
||||
|
||||
improved recipes:
|
||||
- Financial Times UK
|
||||
- Sing Tao Daily
|
||||
- Apple Daily
|
||||
- A List Apart
|
||||
- Business Week
|
||||
- Harpers printed edition
|
||||
- Harvard Business Review
|
||||
|
||||
new recipes:
|
||||
- title: AM730
|
||||
author: Eddie Lau
|
||||
|
||||
- title: Arret sur images
|
||||
author: Francois D
|
||||
|
||||
- title: Diario de Noticias
|
||||
author: Jose Pinto
|
||||
|
||||
- version: 0.9.25
|
||||
date: 2013-03-29
|
||||
|
||||
|
290
recipes/am730.recipe
Normal file
290
recipes/am730.recipe
Normal file
@ -0,0 +1,290 @@
|
||||
# vim:fileencoding=UTF-8
|
||||
from __future__ import unicode_literals
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Eddie Lau'
|
||||
__Date__ = ''
|
||||
__HiResImg__ = True
|
||||
|
||||
'''
|
||||
Change Log:
|
||||
2013/03/30 -- first version
|
||||
'''
|
||||
|
||||
from calibre import (__appname__, force_unicode, strftime)
|
||||
from calibre.utils.date import now as nowf
|
||||
import os, datetime, re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from contextlib import nested
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.utils.localization import canonicalize_lang
|
||||
|
||||
class AppleDaily(BasicNewsRecipe):
|
||||
title = u'AM730'
|
||||
__author__ = 'Eddie Lau'
|
||||
publisher = 'AM730'
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 100
|
||||
auto_cleanup = False
|
||||
language = 'zh'
|
||||
encoding = 'utf-8'
|
||||
auto_cleanup = False
|
||||
remove_javascript = True
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
description = 'http://www.am730.com.hk'
|
||||
category = 'Chinese, News, Hong Kong'
|
||||
masthead_url = 'http://www.am730.com.hk/images/logo.jpg'
|
||||
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} photocaption {font-size:50%; margin-left:auto; margin-right:auto;}'
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'articleHeader'}),
|
||||
dict(name='div', attrs={'class':'thecontent wordsnap'}),
|
||||
dict(name='a', attrs={'class':'lightboximg'})]
|
||||
remove_tags = [dict(name='img', attrs={'src':'/images/am730_article_logo.jpg'}),
|
||||
dict(name='img', attrs={'src':'/images/am_endmark.gif'})]
|
||||
|
||||
def get_dtlocal(self):
|
||||
dt_utc = datetime.datetime.utcnow()
|
||||
# convert UTC to local hk time - at HKT 6am, all news are available
|
||||
return dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24)
|
||||
|
||||
def get_fetchdate(self):
|
||||
if __Date__ <> '':
|
||||
return __Date__
|
||||
else:
|
||||
return self.get_dtlocal().strftime("%Y%m%d")
|
||||
|
||||
def get_fetchformatteddate(self):
|
||||
if __Date__ <> '':
|
||||
return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
|
||||
else:
|
||||
return self.get_dtlocal().strftime("%Y-%m-%d")
|
||||
|
||||
def get_fetchyear(self):
|
||||
if __Date__ <> '':
|
||||
return __Date__[0:4]
|
||||
else:
|
||||
return self.get_dtlocal().strftime("%Y")
|
||||
|
||||
def get_fetchmonth(self):
|
||||
if __Date__ <> '':
|
||||
return __Date__[4:6]
|
||||
else:
|
||||
return self.get_dtlocal().strftime("%m")
|
||||
|
||||
def get_fetchday(self):
|
||||
if __Date__ <> '':
|
||||
return __Date__[6:8]
|
||||
else:
|
||||
return self.get_dtlocal().strftime("%d")
|
||||
|
||||
# Note: does not work with custom date given by __Date__
|
||||
def get_weekday(self):
|
||||
return self.get_dtlocal().weekday()
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
if first and hasattr(self, 'add_toc_thumbnail'):
|
||||
picdiv = soup.find('img')
|
||||
if picdiv is not None:
|
||||
self.add_toc_thumbnail(article,picdiv['src'])
|
||||
|
||||
def parse_index(self):
|
||||
feeds = []
|
||||
soup = self.index_to_soup('http://www.am730.com.hk/')
|
||||
ul = soup.find(attrs={'class':'nav-section'})
|
||||
sectionList = []
|
||||
for li in ul.findAll('li'):
|
||||
a = 'http://www.am730.com.hk/' + li.find('a', href=True).get('href', False)
|
||||
title = li.find('a').get('title', False).strip()
|
||||
sectionList.append((title, a))
|
||||
for title, url in sectionList:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
return feeds
|
||||
|
||||
def parse_section(self, url):
|
||||
soup = self.index_to_soup(url)
|
||||
items = soup.findAll(attrs={'style':'padding-bottom: 15px;'})
|
||||
current_articles = []
|
||||
for item in items:
|
||||
a = item.find(attrs={'class':'t6 f14'}).find('a', href=True)
|
||||
articlelink = 'http://www.am730.com.hk/' + a.get('href', True)
|
||||
title = self.tag_to_string(a)
|
||||
description = self.tag_to_string(item.find(attrs={'class':'t3 f14'}))
|
||||
current_articles.append({'title': title, 'url': articlelink, 'description': description})
|
||||
return current_articles
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
multia = soup.findAll('a')
|
||||
for a in multia:
|
||||
if not (a == None):
|
||||
image = a.find('img')
|
||||
if not (image == None):
|
||||
if __HiResImg__:
|
||||
image['src'] = image.get('src').replace('/thumbs/', '/')
|
||||
caption = image.get('alt')
|
||||
tag = Tag(soup, "photo", [])
|
||||
tag2 = Tag(soup, "photocaption", [])
|
||||
tag.insert(0, image)
|
||||
if not caption == None:
|
||||
tag2.insert(0, caption)
|
||||
tag.insert(1, tag2)
|
||||
a.replaceWith(tag)
|
||||
return soup
|
||||
|
||||
def create_opf(self, feeds, dir=None):
|
||||
if dir is None:
|
||||
dir = self.output_dir
|
||||
title = self.short_title()
|
||||
if self.output_profile.periodical_date_in_title:
|
||||
title += strftime(self.timefmt)
|
||||
mi = MetaInformation(title, [__appname__])
|
||||
mi.publisher = __appname__
|
||||
mi.author_sort = __appname__
|
||||
if self.publication_type:
|
||||
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
|
||||
mi.timestamp = nowf()
|
||||
article_titles, aseen = [], set()
|
||||
for f in feeds:
|
||||
for a in f:
|
||||
if a.title and a.title not in aseen:
|
||||
aseen.add(a.title)
|
||||
article_titles.append(force_unicode(a.title, 'utf-8'))
|
||||
|
||||
mi.comments = self.description
|
||||
if not isinstance(mi.comments, unicode):
|
||||
mi.comments = mi.comments.decode('utf-8', 'replace')
|
||||
mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
|
||||
'\n\n'.join(article_titles))
|
||||
|
||||
language = canonicalize_lang(self.language)
|
||||
if language is not None:
|
||||
mi.language = language
|
||||
# This one affects the pub date shown in kindle title
|
||||
#mi.pubdate = nowf()
|
||||
# now appears to need the time field to be > 12.00noon as well
|
||||
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
|
||||
opf_path = os.path.join(dir, 'index.opf')
|
||||
ncx_path = os.path.join(dir, 'index.ncx')
|
||||
|
||||
opf = OPFCreator(dir, mi)
|
||||
# Add mastheadImage entry to <guide> section
|
||||
mp = getattr(self, 'masthead_path', None)
|
||||
if mp is not None and os.access(mp, os.R_OK):
|
||||
from calibre.ebooks.metadata.opf2 import Guide
|
||||
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
|
||||
ref.type = 'masthead'
|
||||
ref.title = 'Masthead Image'
|
||||
opf.guide.append(ref)
|
||||
|
||||
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
|
||||
manifest.append(os.path.join(dir, 'index.html'))
|
||||
manifest.append(os.path.join(dir, 'index.ncx'))
|
||||
|
||||
# Get cover
|
||||
cpath = getattr(self, 'cover_path', None)
|
||||
if cpath is None:
|
||||
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
|
||||
if self.default_cover(pf):
|
||||
cpath = pf.name
|
||||
if cpath is not None and os.access(cpath, os.R_OK):
|
||||
opf.cover = cpath
|
||||
manifest.append(cpath)
|
||||
|
||||
# Get masthead
|
||||
mpath = getattr(self, 'masthead_path', None)
|
||||
if mpath is not None and os.access(mpath, os.R_OK):
|
||||
manifest.append(mpath)
|
||||
|
||||
opf.create_manifest_from_files_in(manifest)
|
||||
for mani in opf.manifest:
|
||||
if mani.path.endswith('.ncx'):
|
||||
mani.id = 'ncx'
|
||||
if mani.path.endswith('mastheadImage.jpg'):
|
||||
mani.id = 'masthead-image'
|
||||
|
||||
entries = ['index.html']
|
||||
toc = TOC(base_path=dir)
|
||||
self.play_order_counter = 0
|
||||
self.play_order_map = {}
|
||||
|
||||
|
||||
def feed_index(num, parent):
|
||||
f = feeds[num]
|
||||
for j, a in enumerate(f):
|
||||
if getattr(a, 'downloaded', False):
|
||||
adir = 'feed_%d/article_%d/'%(num, j)
|
||||
auth = a.author
|
||||
if not auth:
|
||||
auth = None
|
||||
desc = a.text_summary
|
||||
if not desc:
|
||||
desc = None
|
||||
else:
|
||||
desc = self.description_limiter(desc)
|
||||
tt = a.toc_thumbnail if a.toc_thumbnail else None
|
||||
entries.append('%sindex.html'%adir)
|
||||
po = self.play_order_map.get(entries[-1], None)
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
parent.add_item('%sindex.html'%adir, None,
|
||||
a.title if a.title else _('Untitled Article'),
|
||||
play_order=po, author=auth,
|
||||
description=desc, toc_thumbnail=tt)
|
||||
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
|
||||
for sp in a.sub_pages:
|
||||
prefix = os.path.commonprefix([opf_path, sp])
|
||||
relp = sp[len(prefix):]
|
||||
entries.append(relp.replace(os.sep, '/'))
|
||||
last = sp
|
||||
|
||||
if os.path.exists(last):
|
||||
with open(last, 'rb') as fi:
|
||||
src = fi.read().decode('utf-8')
|
||||
soup = BeautifulSoup(src)
|
||||
body = soup.find('body')
|
||||
if body is not None:
|
||||
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
|
||||
templ = self.navbar.generate(True, num, j, len(f),
|
||||
not self.has_single_feed,
|
||||
a.orig_url, __appname__, prefix=prefix,
|
||||
center=self.center_navbar)
|
||||
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
|
||||
body.insert(len(body.contents), elem)
|
||||
with open(last, 'wb') as fi:
|
||||
fi.write(unicode(soup).encode('utf-8'))
|
||||
if len(feeds) == 0:
|
||||
raise Exception('All feeds are empty, aborting.')
|
||||
|
||||
if len(feeds) > 1:
|
||||
for i, f in enumerate(feeds):
|
||||
entries.append('feed_%d/index.html'%i)
|
||||
po = self.play_order_map.get(entries[-1], None)
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
auth = getattr(f, 'author', None)
|
||||
if not auth:
|
||||
auth = None
|
||||
desc = getattr(f, 'description', None)
|
||||
if not desc:
|
||||
desc = None
|
||||
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
|
||||
f.title, play_order=po, description=desc, author=auth))
|
||||
|
||||
else:
|
||||
entries.append('feed_%d/index.html'%0)
|
||||
feed_index(0, toc)
|
||||
|
||||
for i, p in enumerate(entries):
|
||||
entries[i] = os.path.join(dir, p.replace('/', os.sep))
|
||||
opf.create_spine(entries)
|
||||
opf.set_toc(toc)
|
||||
|
||||
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
|
||||
opf.render(opf_file, ncx_file)
|
||||
|
@ -1,161 +1,275 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
# vim:fileencoding=UTF-8
|
||||
from __future__ import unicode_literals
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Eddie Lau'
|
||||
__Date__ = ''
|
||||
|
||||
from calibre import (__appname__, force_unicode, strftime)
|
||||
from calibre.utils.date import now as nowf
|
||||
import os, datetime, re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from contextlib import nested
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.utils.localization import canonicalize_lang
|
||||
|
||||
class AppleDaily(BasicNewsRecipe):
|
||||
|
||||
title = u'蘋果日報'
|
||||
__author__ = u'蘋果日報'
|
||||
__publisher__ = u'蘋果日報'
|
||||
description = u'蘋果日報'
|
||||
masthead_url = 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
|
||||
language = 'zh_TW'
|
||||
encoding = 'UTF-8'
|
||||
timefmt = ' [%a, %d %b, %Y]'
|
||||
needs_subscription = False
|
||||
title = u'蘋果日報 (香港)'
|
||||
__author__ = 'Eddie Lau'
|
||||
publisher = '蘋果日報'
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 100
|
||||
auto_cleanup = False
|
||||
language = 'zh'
|
||||
encoding = 'utf-8'
|
||||
auto_cleanup = False
|
||||
remove_javascript = True
|
||||
remove_tags_before = dict(name=['ul', 'h1'])
|
||||
remove_tags_after = dict(name='form')
|
||||
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
|
||||
dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
|
||||
dict(name=['script', 'noscript', 'style', 'form'])]
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
extra_css = '''
|
||||
@font-face {font-family: "uming", serif, sans-serif; src: url(res:///usr/share/fonts/truetype/arphic/uming.ttc); }\n
|
||||
body {margin-right: 8pt; font-family: 'uming', serif;}
|
||||
h1 {font-family: 'uming', serif, sans-serif}
|
||||
'''
|
||||
#extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
|
||||
description = 'http://hkm.appledaily.com/'
|
||||
category = 'Chinese, News, Hong Kong'
|
||||
masthead_url = 'http://upload.wikimedia.org/wikipedia/zh/c/cf/AppleDailyLogo1.png'
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'img.php?server=(?P<server>[^&]+)&path=(?P<path>[^&]+).*', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: 'http://' + match.group('server') + '/' + match.group('path')),
|
||||
]
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} h1 {font-size:200%; text-align:left; font-weight:bold;} p[class=video-caption] {font-size:50%; margin-left:auto; margin-right:auto;}'
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'content-article'})]
|
||||
remove_tags = [dict(name='div', attrs={'class':'prev-next-btn'}),
|
||||
dict(name='p', attrs={'class':'next'})]
|
||||
|
||||
def get_dtlocal(self):
|
||||
dt_utc = datetime.datetime.utcnow()
|
||||
# convert UTC to local hk time - at HKT 6am, all news are available
|
||||
return dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24)
|
||||
|
||||
def get_fetchdate(self):
|
||||
if __Date__ <> '':
|
||||
return __Date__
|
||||
else:
|
||||
return self.get_dtlocal().strftime("%Y%m%d")
|
||||
|
||||
def get_fetchformatteddate(self):
|
||||
if __Date__ <> '':
|
||||
return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
|
||||
else:
|
||||
return self.get_dtlocal().strftime("%Y-%m-%d")
|
||||
|
||||
def get_fetchyear(self):
|
||||
if __Date__ <> '':
|
||||
return __Date__[0:4]
|
||||
else:
|
||||
return self.get_dtlocal().strftime("%Y")
|
||||
|
||||
def get_fetchmonth(self):
|
||||
if __Date__ <> '':
|
||||
return __Date__[4:6]
|
||||
else:
|
||||
return self.get_dtlocal().strftime("%m")
|
||||
|
||||
def get_fetchday(self):
|
||||
if __Date__ <> '':
|
||||
return __Date__[6:8]
|
||||
else:
|
||||
return self.get_dtlocal().strftime("%d")
|
||||
|
||||
# Note: does not work with custom date given by __Date__
|
||||
def get_weekday(self):
|
||||
return self.get_dtlocal().weekday()
|
||||
|
||||
def get_cover_url(self):
|
||||
return 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
|
||||
|
||||
|
||||
#def get_browser(self):
|
||||
#br = BasicNewsRecipe.get_browser(self)
|
||||
#if self.username is not None and self.password is not None:
|
||||
# br.open('http://www.nytimes.com/auth/login')
|
||||
# br.select_form(name='login')
|
||||
# br['USERID'] = self.username
|
||||
# br['PASSWORD'] = self.password
|
||||
# br.submit()
|
||||
#return br
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
#process all the images
|
||||
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
|
||||
iurl = tag['src']
|
||||
#print 'checking image: ' + iurl
|
||||
|
||||
#img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
|
||||
p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
|
||||
|
||||
m = p.search(iurl)
|
||||
|
||||
if m is not None:
|
||||
iurl = 'http://' + m.group('server') + '/' + m.group('path')
|
||||
#print 'working! new url: ' + iurl
|
||||
tag['src'] = iurl
|
||||
#else:
|
||||
#print 'not good'
|
||||
|
||||
for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
|
||||
iurl = tag['href']
|
||||
#print 'checking image: ' + iurl
|
||||
|
||||
#img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
|
||||
p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
|
||||
|
||||
m = p.search(iurl)
|
||||
|
||||
if m is not None:
|
||||
iurl = 'http://' + m.group('server') + '/' + m.group('path')
|
||||
#print 'working! new url: ' + iurl
|
||||
tag['href'] = iurl
|
||||
#else:
|
||||
#print 'not good'
|
||||
|
||||
return soup
|
||||
soup = self.index_to_soup('http://hkm.appledaily.com/')
|
||||
cover = soup.find(attrs={'class':'top-news'}).get('src', False)
|
||||
br = BasicNewsRecipe.get_browser(self)
|
||||
try:
|
||||
br.open(cover)
|
||||
except:
|
||||
cover = None
|
||||
return cover
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
if first and hasattr(self, 'add_toc_thumbnail'):
|
||||
picdiv = soup.find('img')
|
||||
if picdiv is not None:
|
||||
self.add_toc_thumbnail(article,picdiv['src'])
|
||||
|
||||
def parse_index(self):
|
||||
base = 'http://news.hotpot.hk/fruit'
|
||||
soup = self.index_to_soup('http://news.hotpot.hk/fruit/index.php')
|
||||
feeds = []
|
||||
soup = self.index_to_soup('http://hkm.appledaily.com/')
|
||||
ul = soup.find(attrs={'class':'menu'})
|
||||
sectionList = []
|
||||
for li in ul.findAll('li'):
|
||||
a = 'http://hkm.appledaily.com/' + li.find('a', href=True).get('href', False)
|
||||
title = li.find('a', text=True).strip()
|
||||
if not title == u'動新聞':
|
||||
sectionList.append((title, a))
|
||||
for title, url in sectionList:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
return feeds
|
||||
|
||||
#def feed_title(div):
|
||||
# return ''.join(div.findAll(text=True, recursive=False)).strip()
|
||||
def parse_section(self, url):
|
||||
soup = self.index_to_soup(url)
|
||||
ul = soup.find(attrs={'class':'list'})
|
||||
current_articles = []
|
||||
for li in ul.findAll('li'):
|
||||
a = li.find('a', href=True)
|
||||
title = li.find('p', text=True).strip()
|
||||
if a is not None:
|
||||
current_articles.append({'title': title, 'url':'http://hkm.appledaily.com/' + a.get('href', False)})
|
||||
pass
|
||||
return current_articles
|
||||
|
||||
articles = {}
|
||||
key = None
|
||||
ans = []
|
||||
for div in soup.findAll('li'):
|
||||
key = div.find(text=True, recursive=True);
|
||||
#if key == u'豪情':
|
||||
# continue;
|
||||
def create_opf(self, feeds, dir=None):
|
||||
if dir is None:
|
||||
dir = self.output_dir
|
||||
title = self.short_title()
|
||||
if self.output_profile.periodical_date_in_title:
|
||||
title += strftime(self.timefmt)
|
||||
mi = MetaInformation(title, [__appname__])
|
||||
mi.publisher = __appname__
|
||||
mi.author_sort = __appname__
|
||||
if self.publication_type:
|
||||
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
|
||||
mi.timestamp = nowf()
|
||||
article_titles, aseen = [], set()
|
||||
for f in feeds:
|
||||
for a in f:
|
||||
if a.title and a.title not in aseen:
|
||||
aseen.add(a.title)
|
||||
article_titles.append(force_unicode(a.title, 'utf-8'))
|
||||
|
||||
print 'section=' + key
|
||||
mi.comments = self.description
|
||||
if not isinstance(mi.comments, unicode):
|
||||
mi.comments = mi.comments.decode('utf-8', 'replace')
|
||||
mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
|
||||
'\n\n'.join(article_titles))
|
||||
|
||||
articles[key] = []
|
||||
language = canonicalize_lang(self.language)
|
||||
if language is not None:
|
||||
mi.language = language
|
||||
# This one affects the pub date shown in kindle title
|
||||
#mi.pubdate = nowf()
|
||||
# now appears to need the time field to be > 12.00noon as well
|
||||
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
|
||||
opf_path = os.path.join(dir, 'index.opf')
|
||||
ncx_path = os.path.join(dir, 'index.ncx')
|
||||
|
||||
ans.append(key)
|
||||
opf = OPFCreator(dir, mi)
|
||||
# Add mastheadImage entry to <guide> section
|
||||
mp = getattr(self, 'masthead_path', None)
|
||||
if mp is not None and os.access(mp, os.R_OK):
|
||||
from calibre.ebooks.metadata.opf2 import Guide
|
||||
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
|
||||
ref.type = 'masthead'
|
||||
ref.title = 'Masthead Image'
|
||||
opf.guide.append(ref)
|
||||
|
||||
a = div.find('a', href=True)
|
||||
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
|
||||
manifest.append(os.path.join(dir, 'index.html'))
|
||||
manifest.append(os.path.join(dir, 'index.ncx'))
|
||||
|
||||
if not a:
|
||||
continue
|
||||
# Get cover
|
||||
cpath = getattr(self, 'cover_path', None)
|
||||
if cpath is None:
|
||||
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
|
||||
if self.default_cover(pf):
|
||||
cpath = pf.name
|
||||
if cpath is not None and os.access(cpath, os.R_OK):
|
||||
opf.cover = cpath
|
||||
manifest.append(cpath)
|
||||
|
||||
url = base + '/' + a['href']
|
||||
print 'url=' + url
|
||||
# Get masthead
|
||||
mpath = getattr(self, 'masthead_path', None)
|
||||
if mpath is not None and os.access(mpath, os.R_OK):
|
||||
manifest.append(mpath)
|
||||
|
||||
if not articles.has_key(key):
|
||||
articles[key] = []
|
||||
else:
|
||||
# sub page
|
||||
subSoup = self.index_to_soup(url)
|
||||
opf.create_manifest_from_files_in(manifest)
|
||||
for mani in opf.manifest:
|
||||
if mani.path.endswith('.ncx'):
|
||||
mani.id = 'ncx'
|
||||
if mani.path.endswith('mastheadImage.jpg'):
|
||||
mani.id = 'masthead-image'
|
||||
|
||||
for subDiv in subSoup.findAll('li'):
|
||||
subA = subDiv.find('a', href=True)
|
||||
subTitle = subDiv.find(text=True, recursive=True)
|
||||
subUrl = base + '/' + subA['href']
|
||||
|
||||
print 'subUrl' + subUrl
|
||||
|
||||
articles[key].append(
|
||||
dict(title=subTitle,
|
||||
url=subUrl,
|
||||
date='',
|
||||
description='',
|
||||
content=''))
|
||||
entries = ['index.html']
|
||||
toc = TOC(base_path=dir)
|
||||
self.play_order_counter = 0
|
||||
self.play_order_map = {}
|
||||
|
||||
|
||||
# elif div['class'] in ['story', 'story headline']:
|
||||
# a = div.find('a', href=True)
|
||||
# if not a:
|
||||
# continue
|
||||
# url = re.sub(r'\?.*', '', a['href'])
|
||||
# url += '?pagewanted=all'
|
||||
# title = self.tag_to_string(a, use_alt=True).strip()
|
||||
# description = ''
|
||||
# pubdate = strftime('%a, %d %b')
|
||||
# summary = div.find(True, attrs={'class':'summary'})
|
||||
# if summary:
|
||||
# description = self.tag_to_string(summary, use_alt=False)
|
||||
#
|
||||
# feed = key if key is not None else 'Uncategorized'
|
||||
# if not articles.has_key(feed):
|
||||
# articles[feed] = []
|
||||
# if not 'podcasts' in url:
|
||||
# articles[feed].append(
|
||||
# dict(title=title, url=url, date=pubdate,
|
||||
# description=description,
|
||||
# content=''))
|
||||
# ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
|
||||
ans = [(unicode(key), articles[key]) for key in ans if articles.has_key(key)]
|
||||
return ans
|
||||
def feed_index(num, parent):
|
||||
f = feeds[num]
|
||||
for j, a in enumerate(f):
|
||||
if getattr(a, 'downloaded', False):
|
||||
adir = 'feed_%d/article_%d/'%(num, j)
|
||||
auth = a.author
|
||||
if not auth:
|
||||
auth = None
|
||||
desc = a.text_summary
|
||||
if not desc:
|
||||
desc = None
|
||||
else:
|
||||
desc = self.description_limiter(desc)
|
||||
tt = a.toc_thumbnail if a.toc_thumbnail else None
|
||||
entries.append('%sindex.html'%adir)
|
||||
po = self.play_order_map.get(entries[-1], None)
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
parent.add_item('%sindex.html'%adir, None,
|
||||
a.title if a.title else _('Untitled Article'),
|
||||
play_order=po, author=auth,
|
||||
description=desc, toc_thumbnail=tt)
|
||||
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
|
||||
for sp in a.sub_pages:
|
||||
prefix = os.path.commonprefix([opf_path, sp])
|
||||
relp = sp[len(prefix):]
|
||||
entries.append(relp.replace(os.sep, '/'))
|
||||
last = sp
|
||||
|
||||
if os.path.exists(last):
|
||||
with open(last, 'rb') as fi:
|
||||
src = fi.read().decode('utf-8')
|
||||
soup = BeautifulSoup(src)
|
||||
body = soup.find('body')
|
||||
if body is not None:
|
||||
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
|
||||
templ = self.navbar.generate(True, num, j, len(f),
|
||||
not self.has_single_feed,
|
||||
a.orig_url, __appname__, prefix=prefix,
|
||||
center=self.center_navbar)
|
||||
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
|
||||
body.insert(len(body.contents), elem)
|
||||
with open(last, 'wb') as fi:
|
||||
fi.write(unicode(soup).encode('utf-8'))
|
||||
if len(feeds) == 0:
|
||||
raise Exception('All feeds are empty, aborting.')
|
||||
|
||||
if len(feeds) > 1:
|
||||
for i, f in enumerate(feeds):
|
||||
entries.append('feed_%d/index.html'%i)
|
||||
po = self.play_order_map.get(entries[-1], None)
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
auth = getattr(f, 'author', None)
|
||||
if not auth:
|
||||
auth = None
|
||||
desc = getattr(f, 'description', None)
|
||||
if not desc:
|
||||
desc = None
|
||||
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
|
||||
f.title, play_order=po, description=desc, author=auth))
|
||||
|
||||
else:
|
||||
entries.append('feed_%d/index.html'%0)
|
||||
feed_index(0, toc)
|
||||
|
||||
for i, p in enumerate(entries):
|
||||
entries[i] = os.path.join(dir, p.replace('/', os.sep))
|
||||
opf.create_spine(entries)
|
||||
opf.set_toc(toc)
|
||||
|
||||
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
|
||||
opf.render(opf_file, ncx_file)
|
||||
|
||||
|
@ -9,14 +9,14 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
|
||||
__author__ = 'Dave Asbury'
|
||||
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/161987_9010212100_2035706408_n.jpg'
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 12
|
||||
max_articles_per_feed = 20
|
||||
linearize_tables = True
|
||||
remove_empty_feeds = True
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
auto_cleanup = True
|
||||
language = 'en_GB'
|
||||
|
||||
compress_news_images = True
|
||||
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/161987_9010212100_2035706408_n.jpg'
|
||||
|
||||
masthead_url = 'http://www.trinitymirror.com/images/birminghampost-logo.gif'
|
||||
|
@ -1,3 +1,4 @@
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from collections import OrderedDict
|
||||
|
||||
@ -39,7 +40,7 @@ class BusinessWeekMagazine(BasicNewsRecipe):
|
||||
title=self.tag_to_string(div.a).strip()
|
||||
url=div.a['href']
|
||||
soup0 = self.index_to_soup(url)
|
||||
urlprint=soup0.find('li', attrs={'class':'print tracked'}).a['href']
|
||||
urlprint=soup0.find('a', attrs={'href':re.compile('.*printer.*')})['href']
|
||||
articles.append({'title':title, 'url':urlprint, 'description':'', 'date':''})
|
||||
|
||||
|
||||
@ -56,7 +57,7 @@ class BusinessWeekMagazine(BasicNewsRecipe):
|
||||
title=self.tag_to_string(div.a).strip()
|
||||
url=div.a['href']
|
||||
soup0 = self.index_to_soup(url)
|
||||
urlprint=soup0.find('li', attrs={'class':'print tracked'}).a['href']
|
||||
urlprint=soup0.find('a', attrs={'href':re.compile('.*printer.*')})['href']
|
||||
articles.append({'title':title, 'url':urlprint, 'description':desc, 'date':''})
|
||||
|
||||
if articles:
|
||||
|
@ -7,13 +7,14 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
|
||||
#cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
|
||||
__author__ = 'Dave Asbury'
|
||||
description = 'The official website of Countryfile Magazine'
|
||||
# last updated 8/12/12
|
||||
# last updated 19/10/12
|
||||
language = 'en_GB'
|
||||
oldest_article = 30
|
||||
max_articles_per_feed = 25
|
||||
remove_empty_feeds = True
|
||||
no_stylesheets = True
|
||||
auto_cleanup = True
|
||||
compress_news_images = True
|
||||
ignore_duplicate_articles = {'title', 'url'}
|
||||
#articles_are_obfuscated = True
|
||||
#article_already_exists = False
|
||||
|
@ -13,9 +13,9 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
|
||||
|
||||
masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'
|
||||
|
||||
|
||||
compress_news_images = True
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 1
|
||||
max_articles_per_feed = 12
|
||||
remove_empty_feeds = True
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
|
23
recipes/diario_de_noticias.recipe
Normal file
23
recipes/diario_de_noticias.recipe
Normal file
@ -0,0 +1,23 @@
|
||||
# vim:fileencoding=UTF-8
|
||||
|
||||
from __future__ import unicode_literals
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1365070687(BasicNewsRecipe):
|
||||
title ='Diário de Notícias'
|
||||
oldest_article = 7
|
||||
language = 'pt'
|
||||
__author__ = 'Jose Pinto'
|
||||
max_articles_per_feed = 100
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'cln-esqmid'}) ]
|
||||
remove_tags = [ dict(name='table', attrs={'class':'TabFerramentasInf'}) ]
|
||||
|
||||
feeds = [(u'Portugal', u'http://feeds.dn.pt/DN-Portugal'),
|
||||
(u'Globo', u'http://feeds.dn.pt/DN-Globo'),
|
||||
(u'Economia', u'http://feeds.dn.pt/DN-Economia'),
|
||||
(u'Ci\xeancia', u'http://feeds.dn.pt/DN-Ciencia'),
|
||||
(u'Artes', u'http://feeds.dn.pt/DN-Artes'),
|
||||
(u'TV & Media', u'http://feeds.dn.pt/DN-Media'),
|
||||
(u'Opini\xe3o', u'http://feeds.dn.pt/DN-Opiniao'),
|
||||
(u'Pessoas', u'http://feeds.dn.pt/DN-Pessoas')
|
||||
]
|
17
recipes/economia.recipe
Normal file
17
recipes/economia.recipe
Normal file
@ -0,0 +1,17 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1314326622(BasicNewsRecipe):
|
||||
title = u'Economia'
|
||||
__author__ = 'Manish Bhattarai'
|
||||
description = 'Economia - Intelligence & Insight for ICAEW Members'
|
||||
language = 'en_GB'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 25
|
||||
masthead_url = 'http://economia.icaew.com/~/media/Images/Design%20Images/Economia_Red_website.ashx'
|
||||
cover_url = 'http://economia.icaew.com/~/media/Images/Design%20Images/Economia_Red_website.ashx'
|
||||
no_stylesheets = True
|
||||
remove_empty_feeds = True
|
||||
remove_tags_before = dict(id='content')
|
||||
remove_tags_after = dict(id='stars-wrapper')
|
||||
remove_tags = [dict(attrs={'class':['floatR', 'sharethis', 'rating clearfix']})]
|
||||
feeds = [(u'News', u'http://feedity.com/icaew-com/VlNTVFRa.rss'),(u'Business', u'http://feedity.com/icaew-com/VlNTVFtS.rss'),(u'People', u'http://feedity.com/icaew-com/VlNTVFtX.rss'),(u'Opinion', u'http://feedity.com/icaew-com/VlNTVFtW.rss'),(u'Finance', u'http://feedity.com/icaew-com/VlNTVFtV.rss')]
|
@ -8,6 +8,7 @@ import datetime
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
from calibre import strftime
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from collections import OrderedDict
|
||||
|
||||
class FinancialTimes(BasicNewsRecipe):
|
||||
title = 'Financial Times (UK)'
|
||||
@ -93,7 +94,7 @@ class FinancialTimes(BasicNewsRecipe):
|
||||
try:
|
||||
urlverified = self.browser.open_novisit(url).geturl() # resolve redirect.
|
||||
except:
|
||||
continue
|
||||
continue
|
||||
title = self.tag_to_string(item)
|
||||
date = strftime(self.timefmt)
|
||||
articles.append({
|
||||
@ -105,29 +106,30 @@ class FinancialTimes(BasicNewsRecipe):
|
||||
return articles
|
||||
|
||||
def parse_index(self):
|
||||
feeds = []
|
||||
feeds = OrderedDict()
|
||||
soup = self.index_to_soup(self.INDEX)
|
||||
dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
|
||||
self.timefmt = ' [%s]'%dates
|
||||
wide = soup.find('div',attrs={'class':'wide'})
|
||||
if not wide:
|
||||
return feeds
|
||||
allsections = wide.findAll(attrs={'class':lambda x: x and 'footwell' in x.split()})
|
||||
if not allsections:
|
||||
return feeds
|
||||
count = 0
|
||||
for item in allsections:
|
||||
count = count + 1
|
||||
if self.test and count > 2:
|
||||
return feeds
|
||||
fitem = item.h3
|
||||
if not fitem:
|
||||
fitem = item.h4
|
||||
ftitle = self.tag_to_string(fitem)
|
||||
self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle))
|
||||
feedarts = self.get_artlinks(item.ul)
|
||||
feeds.append((ftitle,feedarts))
|
||||
return feeds
|
||||
#dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
|
||||
#self.timefmt = ' [%s]'%dates
|
||||
section_title = 'Untitled'
|
||||
|
||||
for column in soup.findAll('div', attrs = {'class':'feedBoxes clearfix'}):
|
||||
for section in column. findAll('div', attrs = {'class':'feedBox'}):
|
||||
sectiontitle=self.tag_to_string(section.find('h4'))
|
||||
if '...' not in sectiontitle: section_title=sectiontitle
|
||||
for article in section.ul.findAll('li'):
|
||||
articles = []
|
||||
title=self.tag_to_string(article.a)
|
||||
url=article.a['href']
|
||||
articles.append({'title':title, 'url':url, 'description':'', 'date':''})
|
||||
|
||||
if articles:
|
||||
if section_title not in feeds:
|
||||
feeds[section_title] = []
|
||||
feeds[section_title] += articles
|
||||
|
||||
|
||||
ans = [(key, val) for key, val in feeds.iteritems()]
|
||||
return ans
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
items = ['promo-box','promo-title',
|
||||
@ -174,9 +176,6 @@ class FinancialTimes(BasicNewsRecipe):
|
||||
count += 1
|
||||
tfile = PersistentTemporaryFile('_fa.html')
|
||||
tfile.write(html)
|
||||
tfile.close()
|
||||
tfile.close()
|
||||
self.temp_files.append(tfile)
|
||||
return tfile.name
|
||||
|
||||
def cleanup(self):
|
||||
self.browser.open('https://registration.ft.com/registration/login/logout?location=')
|
@ -5,7 +5,6 @@ __license__ = 'GPL v3'
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import datetime
|
||||
import re
|
||||
from calibre.ebooks.BeautifulSoup import Comment
|
||||
|
||||
class forbes_pl(BasicNewsRecipe):
|
||||
title = u'Forbes.pl'
|
||||
@ -26,9 +25,9 @@ class forbes_pl(BasicNewsRecipe):
|
||||
pages_count = 4
|
||||
keep_only_tags = [dict(attrs={'class':['Block-Node Content-Article ', 'Block-Node Content-Article piano-closed']})]
|
||||
remove_tags = [dict(attrs={'class':['Keywords Styled', 'twitter-share-button', 'Block-List-Related Block-List']})]
|
||||
|
||||
|
||||
feeds = [(u'Wszystkie', 'http://www.forbes.pl/rss')]
|
||||
|
||||
|
||||
'''def preprocess_html(self, soup):
|
||||
self.append_page(soup, soup.body)
|
||||
return soup
|
||||
@ -51,4 +50,4 @@ class forbes_pl(BasicNewsRecipe):
|
||||
appendtag.insert(pos, pagetext)
|
||||
if cleanup:
|
||||
for r in appendtag.findAll(attrs={'class':'paginator'}):
|
||||
r.extract()'''
|
||||
r.extract()'''
|
||||
|
108
recipes/galaxys_edge.recipe
Normal file
108
recipes/galaxys_edge.recipe
Normal file
@ -0,0 +1,108 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class GalaxyEdge(BasicNewsRecipe):
|
||||
title = u'The Galaxy\'s Edge'
|
||||
language = 'en'
|
||||
|
||||
oldest_article = 7
|
||||
__author__ = 'Krittika Goyal'
|
||||
no_stylesheets = True
|
||||
|
||||
auto_cleanup = True
|
||||
|
||||
#keep_only_tags = [dict(id='content')]
|
||||
#remove_tags = [dict(attrs={'class':['article-links', 'breadcr']}),
|
||||
#dict(id=['email-section', 'right-column', 'printfooter', 'topover',
|
||||
#'slidebox', 'th_footer'])]
|
||||
|
||||
extra_css = '.photo-caption { font-size: smaller }'
|
||||
|
||||
def parse_index(self):
|
||||
soup = self.index_to_soup('http://www.galaxysedge.com/')
|
||||
main = soup.find('table', attrs={'width':'911'})
|
||||
toc = main.find('td', attrs={'width':'225'})
|
||||
|
||||
|
||||
|
||||
current_section = None
|
||||
current_articles = []
|
||||
feeds = []
|
||||
c = 0
|
||||
for x in toc.findAll(['p']):
|
||||
c = c+1
|
||||
if c == 5:
|
||||
if current_articles and current_section:
|
||||
feeds.append((current_section, current_articles))
|
||||
edwo = x.find('a')
|
||||
current_section = self.tag_to_string(edwo)
|
||||
current_articles = []
|
||||
self.log('\tFound section:', current_section)
|
||||
title = self.tag_to_string(edwo)
|
||||
url = edwo.get('href', True)
|
||||
url = 'http://www.galaxysedge.com/'+url
|
||||
print(title)
|
||||
print(c)
|
||||
if not url or not title:
|
||||
continue
|
||||
self.log('\t\tFound article:', title)
|
||||
self.log('\t\t\t', url)
|
||||
current_articles.append({'title': title, 'url':url,
|
||||
'description':'', 'date':''})
|
||||
elif c>5:
|
||||
current_section = self.tag_to_string(x.find('b'))
|
||||
current_articles = []
|
||||
self.log('\tFound section:', current_section)
|
||||
for y in x.findAll('a'):
|
||||
title = self.tag_to_string(y)
|
||||
url = y.get('href', True)
|
||||
url = 'http://www.galaxysedge.com/'+url
|
||||
print(title)
|
||||
if not url or not title:
|
||||
continue
|
||||
self.log('\t\tFound article:', title)
|
||||
self.log('\t\t\t', url)
|
||||
current_articles.append({'title': title, 'url':url,
|
||||
'description':'', 'date':''})
|
||||
if current_articles and current_section:
|
||||
feeds.append((current_section, current_articles))
|
||||
|
||||
return feeds
|
||||
|
||||
|
||||
|
||||
|
||||
#def preprocess_raw_html(self, raw, url):
|
||||
#return raw.replace('<body><p>', '<p>').replace('</p></body>', '</p>')
|
||||
|
||||
#def postprocess_html(self, soup, first_fetch):
|
||||
#for t in soup.findAll(['table', 'tr', 'td','center']):
|
||||
#t.name = 'div'
|
||||
#return soup
|
||||
|
||||
#def parse_index(self):
|
||||
#today = time.strftime('%Y-%m-%d')
|
||||
#soup = self.index_to_soup(
|
||||
#'http://www.thehindu.com/todays-paper/tp-index/?date=' + today)
|
||||
#div = soup.find(id='left-column')
|
||||
#feeds = []
|
||||
#current_section = None
|
||||
#current_articles = []
|
||||
#for x in div.findAll(['h3', 'div']):
|
||||
#if current_section and x.get('class', '') == 'tpaper':
|
||||
#a = x.find('a', href=True)
|
||||
#if a is not None:
|
||||
#current_articles.append({'url':a['href']+'?css=print',
|
||||
#'title':self.tag_to_string(a), 'date': '',
|
||||
#'description':''})
|
||||
#if x.name == 'h3':
|
||||
#if current_section and current_articles:
|
||||
#feeds.append((current_section, current_articles))
|
||||
#current_section = self.tag_to_string(x)
|
||||
#current_articles = []
|
||||
#return feeds
|
||||
|
||||
|
@ -1,6 +1,4 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import re
|
||||
from datetime import date, timedelta
|
||||
|
||||
class HBR(BasicNewsRecipe):
|
||||
|
||||
@ -11,23 +9,18 @@ class HBR(BasicNewsRecipe):
|
||||
timefmt = ' [%B %Y]'
|
||||
language = 'en'
|
||||
no_stylesheets = True
|
||||
# recipe_disabled = ('hbr.org has started requiring the use of javascript'
|
||||
# ' to log into their website. This is unsupported in calibre, so'
|
||||
# ' this recipe has been disabled. If you would like to see '
|
||||
# ' HBR supported in calibre, contact hbr.org and ask them'
|
||||
# ' to provide a javascript free login method.')
|
||||
|
||||
LOGIN_URL = 'https://hbr.org/login?request_url=/'
|
||||
LOGOUT_URL = 'https://hbr.org/logout?request_url=/'
|
||||
|
||||
INDEX = 'http://hbr.org/archive-toc/BR'
|
||||
INDEX = 'http://hbr.org'
|
||||
|
||||
keep_only_tags = [dict(name='div', id='pageContainer')]
|
||||
remove_tags = [dict(id=['mastheadContainer', 'magazineHeadline',
|
||||
'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn',
|
||||
'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR',
|
||||
'mailingListTout', 'partnerCenter', 'pageFooter',
|
||||
'superNavHeadContainer', 'hbrDisqus',
|
||||
'superNavHeadContainer', 'hbrDisqus', 'article-toolbox',
|
||||
'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD']),
|
||||
dict(name='iframe')]
|
||||
extra_css = '''
|
||||
@ -57,22 +50,6 @@ class HBR(BasicNewsRecipe):
|
||||
if url.endswith('/ar/1'):
|
||||
return url[:-1]+'pr'
|
||||
|
||||
def hbr_get_toc(self):
|
||||
# return self.index_to_soup(open('/t/toc.html').read())
|
||||
|
||||
today = date.today()
|
||||
future = today + timedelta(days=30)
|
||||
past = today - timedelta(days=30)
|
||||
for x in [x.strftime('%y%m') for x in (future, today, past)]:
|
||||
url = self.INDEX + x
|
||||
soup = self.index_to_soup(url)
|
||||
if (not soup.find(text='Issue Not Found') and not soup.find(
|
||||
text="We're Sorry. There was an error processing your request")
|
||||
and 'Exception: java.io.FileNotFoundException' not in
|
||||
unicode(soup)):
|
||||
return soup
|
||||
raise Exception('Could not find current issue')
|
||||
|
||||
def hbr_parse_toc(self, soup):
|
||||
feeds = []
|
||||
current_section = None
|
||||
@ -105,23 +82,19 @@ class HBR(BasicNewsRecipe):
|
||||
|
||||
articles.append({'title':title, 'url':url, 'description':desc,
|
||||
'date':''})
|
||||
|
||||
if current_section is not None and articles:
|
||||
feeds.append((current_section, articles))
|
||||
return feeds
|
||||
|
||||
def parse_index(self):
|
||||
soup = self.hbr_get_toc()
|
||||
# open('/t/hbr.html', 'wb').write(unicode(soup).encode('utf-8'))
|
||||
soup0 = self.index_to_soup('http://hbr.org/magazine')
|
||||
datencover = soup0.find('ul', attrs={'id':'magazineArchiveCarousel'}).findAll('li')[-1]
|
||||
#find date & cover
|
||||
self.cover_url=datencover.img['src']
|
||||
dates=self.tag_to_string(datencover.img['alt'])
|
||||
self.timefmt = u' [%s]'%dates
|
||||
soup = self.index_to_soup(self.INDEX + soup0.find('div', attrs = {'class':'magazine_page'}).a['href'])
|
||||
feeds = self.hbr_parse_toc(soup)
|
||||
return feeds
|
||||
|
||||
def get_cover_url(self):
|
||||
cover_url = None
|
||||
index = 'http://hbr.org/current'
|
||||
soup = self.index_to_soup(index)
|
||||
link_item = soup.find('img', alt=re.compile("Current Issue"), src=True)
|
||||
|
||||
if link_item:
|
||||
cover_url = 'http://hbr.org' + link_item['src']
|
||||
|
||||
return cover_url
|
||||
|
||||
|
||||
|
@ -1,33 +1,23 @@
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
# vim:fileencoding=UTF-8
|
||||
from __future__ import unicode_literals
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AListApart (BasicNewsRecipe):
|
||||
__author__ = u'Marc Busqué <marc@lamarciana.com>'
|
||||
__author__ = 'Marc Busqué <marc@lamarciana.com>'
|
||||
__url__ = 'http://www.lamarciana.com'
|
||||
__version__ = '1.0'
|
||||
__version__ = '2.0'
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = u'2012, Marc Busqué <marc@lamarciana.com>'
|
||||
__copyright__ = '2012, Marc Busqué <marc@lamarciana.com>'
|
||||
title = u'A List Apart'
|
||||
description = u'A List Apart Magazine (ISSN: 1534-0295) explores the design, development, and meaning of web content, with a special focus on web standards and best practices.'
|
||||
description = u'A List Apart Magazine (ISSN: 1534-0295) explores the design, development, and meaning of web content, with a special focus on web standards and best practices. This recipe retrieve articles and columns.'
|
||||
language = 'en'
|
||||
tags = 'web development, software'
|
||||
oldest_article = 120
|
||||
remove_empty_feeds = True
|
||||
no_stylesheets = True
|
||||
encoding = 'utf8'
|
||||
cover_url = u'http://alistapart.com/pix/alalogo.gif'
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'id': 'content'})
|
||||
]
|
||||
remove_tags = [
|
||||
dict(name='ul', attrs={'id': 'metastuff'}),
|
||||
dict(name='div', attrs={'class': 'discuss'}),
|
||||
dict(name='div', attrs={'class': 'discuss'}),
|
||||
dict(name='div', attrs={'id': 'learnmore'}),
|
||||
]
|
||||
remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height']
|
||||
extra_css = u'img {max-width: 100%; display: block; margin: auto;} #authorbio img {float: left; margin-right: 2%;}'
|
||||
extra_css = u'img {max-width: 100%; display: block; margin: auto;}'
|
||||
|
||||
feeds = [
|
||||
(u'A List Apart', u'http://www.alistapart.com/site/rss'),
|
||||
(u'A List Apart', u'http://feeds.feedburner.com/alistapart/abridged'),
|
||||
]
|
||||
|
@ -6,10 +6,10 @@ import time
|
||||
|
||||
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
|
||||
title = u'Metro UK'
|
||||
description = 'News as provided by The Metro -UK'
|
||||
description = 'News from The Metro, UK'
|
||||
#timefmt = ''
|
||||
__author__ = 'fleclerc & Dave Asbury'
|
||||
#last update 20/1/13
|
||||
__author__ = 'Dave Asbury'
|
||||
#last update 4/4/13
|
||||
#cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'
|
||||
|
||||
cover_url = 'https://twimg0-a.akamaihd.net/profile_images/1638332595/METRO_LETTERS-01.jpg'
|
||||
@ -22,7 +22,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
|
||||
|
||||
language = 'en_GB'
|
||||
masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif'
|
||||
|
||||
compress_news_images = True
|
||||
def parse_index(self):
|
||||
articles = {}
|
||||
key = None
|
||||
|
@ -1,64 +1,44 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008-2013, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
newyorker.com
|
||||
'''
|
||||
|
||||
'''
|
||||
www.canada.com
|
||||
'''
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||
|
||||
class NewYorker(BasicNewsRecipe):
|
||||
title = 'The New Yorker'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'The best of US journalism'
|
||||
oldest_article = 15
|
||||
language = 'en'
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
publisher = 'Conde Nast Publications'
|
||||
category = 'news, politics, USA'
|
||||
encoding = 'cp1252'
|
||||
publication_type = 'magazine'
|
||||
masthead_url = 'http://www.newyorker.com/css/i/hed/logo.gif'
|
||||
extra_css = """
|
||||
body {font-family: "Times New Roman",Times,serif}
|
||||
.articleauthor{color: #9F9F9F;
|
||||
font-family: Arial, sans-serif;
|
||||
font-size: small;
|
||||
text-transform: uppercase}
|
||||
.rubric,.dd,h6#credit{color: #CD0021;
|
||||
font-family: Arial, sans-serif;
|
||||
font-size: small;
|
||||
text-transform: uppercase}
|
||||
.descender:first-letter{display: inline; font-size: xx-large; font-weight: bold}
|
||||
.dd,h6#credit{color: gray}
|
||||
.c{display: block}
|
||||
.caption,h2#articleintro{font-style: italic}
|
||||
.caption{font-size: small}
|
||||
"""
|
||||
|
||||
conversion_options = {
|
||||
'comment' : description
|
||||
, 'tags' : category
|
||||
, 'publisher' : publisher
|
||||
, 'language' : language
|
||||
}
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'pagebody'})]
|
||||
remove_tags = [
|
||||
dict(name=['meta','iframe','base','link','embed','object'])
|
||||
,dict(attrs={'class':['utils','socialUtils','articleRailLinks','icons','social-utils-top','entry-keywords','entry-categories','utilsPrintEmail'] })
|
||||
,dict(attrs={'id':['show-header','show-footer'] })
|
||||
]
|
||||
remove_tags_after = dict(attrs={'class':'entry-content'})
|
||||
remove_attributes = ['lang']
|
||||
feeds = [(u'The New Yorker', u'http://www.newyorker.com/services/mrss/feeds/everything.xml')]
|
||||
title = u'New Yorker Magazine'
|
||||
newyorker_prefix = 'http://m.newyorker.com'
|
||||
description = u'Content from the New Yorker website'
|
||||
fp_tag = 'CAN_TC'
|
||||
|
||||
def print_version(self, url):
|
||||
return url + '?printable=true¤tPage=all'
|
||||
masthead_url = 'http://www.newyorker.com/images/elements/print/newyorker_printlogo.gif'
|
||||
|
||||
def image_url_processor(self, baseurl, url):
|
||||
return url.strip()
|
||||
compress_news_images = True
|
||||
compress_news_images_auto_size = 8
|
||||
scale_news_images_to_device = False
|
||||
scale_news_images = (768, 1024)
|
||||
|
||||
url_list = []
|
||||
language = 'en'
|
||||
__author__ = 'Nick Redding'
|
||||
no_stylesheets = True
|
||||
timefmt = ' [%b %d]'
|
||||
encoding = 'utf-8'
|
||||
extra_css = '''
|
||||
.byline { font-size:xx-small; font-weight: bold;}
|
||||
h3 { margin-bottom: 6px; }
|
||||
.caption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
||||
'''
|
||||
keep_only_tags = [dict(name='div', attrs={'id':re.compile('pagebody')})]
|
||||
|
||||
remove_tags = [{'class':'socialUtils'},{'class':'entry-keywords'}]
|
||||
|
||||
def get_cover_url(self):
|
||||
cover_url = "http://www.newyorker.com/images/covers/1925/1925_02_21_p233.jpg"
|
||||
@ -68,13 +48,233 @@ class NewYorker(BasicNewsRecipe):
|
||||
cover_url = 'http://www.newyorker.com' + cover_item.div.img['src'].strip()
|
||||
return cover_url
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
auth = soup.find(attrs={'id':'articleauthor'})
|
||||
if auth:
|
||||
alink = auth.find('a')
|
||||
if alink and alink.string is not None:
|
||||
txt = alink.string
|
||||
alink.replaceWith(txt)
|
||||
def fixChars(self,string):
|
||||
# Replace lsquo (\x91)
|
||||
fixed = re.sub("\x91","‘",string)
|
||||
# Replace rsquo (\x92)
|
||||
fixed = re.sub("\x92","’",fixed)
|
||||
# Replace ldquo (\x93)
|
||||
fixed = re.sub("\x93","“",fixed)
|
||||
# Replace rdquo (\x94)
|
||||
fixed = re.sub("\x94","”",fixed)
|
||||
# Replace ndash (\x96)
|
||||
fixed = re.sub("\x96","–",fixed)
|
||||
# Replace mdash (\x97)
|
||||
fixed = re.sub("\x97","—",fixed)
|
||||
fixed = re.sub("’","’",fixed)
|
||||
return fixed
|
||||
|
||||
def massageNCXText(self, description):
|
||||
# Kindle TOC descriptions won't render certain characters
|
||||
if description:
|
||||
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
||||
# Replace '&' with '&'
|
||||
massaged = re.sub("&","&", massaged)
|
||||
return self.fixChars(massaged)
|
||||
else:
|
||||
return description
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
if first:
|
||||
picdiv = soup.find('body').find('img')
|
||||
if picdiv is not None:
|
||||
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
|
||||
xtitle = article.text_summary.strip()
|
||||
if len(xtitle) == 0:
|
||||
desc = soup.find('meta',attrs={'property':'og:description'})
|
||||
if desc is not None:
|
||||
article.summary = article.text_summary = desc['content']
|
||||
shortparagraph = ""
|
||||
## try:
|
||||
if len(article.text_summary.strip()) == 0:
|
||||
articlebodies = soup.findAll('div',attrs={'class':'entry-content'})
|
||||
if articlebodies:
|
||||
for articlebody in articlebodies:
|
||||
if articlebody:
|
||||
paras = articlebody.findAll('p')
|
||||
for p in paras:
|
||||
refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
|
||||
#account for blank paragraphs and short paragraphs by appending them to longer ones
|
||||
if len(refparagraph) > 0:
|
||||
if len(refparagraph) > 70: #approximately one line of text
|
||||
newpara = shortparagraph + refparagraph
|
||||
article.summary = article.text_summary = newpara.strip()
|
||||
return
|
||||
else:
|
||||
shortparagraph = refparagraph + " "
|
||||
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
|
||||
shortparagraph = shortparagraph + "- "
|
||||
else:
|
||||
article.summary = article.text_summary = self.massageNCXText(article.text_summary)
|
||||
## except:
|
||||
## self.log("Error creating article descriptions")
|
||||
## return
|
||||
|
||||
|
||||
def strip_anchors(self,soup):
|
||||
paras = soup.findAll(True)
|
||||
for para in paras:
|
||||
aTags = para.findAll('a')
|
||||
for a in aTags:
|
||||
if a.img is None:
|
||||
a.replaceWith(a.renderContents().decode('cp1252','replace'))
|
||||
return soup
|
||||
|
||||
def preprocess_html(self,soup):
|
||||
dateline = soup.find('div','published')
|
||||
byline = soup.find('div','byline')
|
||||
title = soup.find('h1','entry-title')
|
||||
if title is None:
|
||||
return self.strip_anchors(soup)
|
||||
if byline is None:
|
||||
title.append(dateline)
|
||||
return self.strip_anchors(soup)
|
||||
byline.append(dateline)
|
||||
return self.strip_anchors(soup)
|
||||
|
||||
def load_global_nav(self,soup):
|
||||
seclist = []
|
||||
ul = soup.find('ul',attrs={'id':re.compile('global-nav-menu')})
|
||||
if ul is not None:
|
||||
for li in ul.findAll('li'):
|
||||
if li.a is not None:
|
||||
securl = li.a['href']
|
||||
if securl != '/' and securl != '/magazine' and securl.startswith('/'):
|
||||
seclist.append((self.tag_to_string(li.a),self.newyorker_prefix+securl))
|
||||
return seclist
|
||||
|
||||
def exclude_url(self,url):
|
||||
if url in self.url_list:
|
||||
return True
|
||||
if not url.endswith('html'):
|
||||
return True
|
||||
if 'goings-on-about-town-app' in url:
|
||||
return True
|
||||
if 'something-to-be-thankful-for' in url:
|
||||
return True
|
||||
if '/shouts/' in url:
|
||||
return True
|
||||
if 'out-loud' in url:
|
||||
return True
|
||||
if '/rss/' in url:
|
||||
return True
|
||||
if '/video-' in url:
|
||||
return True
|
||||
self.url_list.append(url)
|
||||
return False
|
||||
|
||||
|
||||
def load_index_page(self,soup):
|
||||
article_list = []
|
||||
for div in soup.findAll('div',attrs={'class':re.compile('^rotator')}):
|
||||
h2 = div.h2
|
||||
if h2 is not None:
|
||||
a = h2.a
|
||||
if a is not None:
|
||||
url = a['href']
|
||||
if not self.exclude_url(url):
|
||||
if url.startswith('/'):
|
||||
url = self.newyorker_prefix+url
|
||||
byline = h2.span
|
||||
if byline is not None:
|
||||
author = self.tag_to_string(byline)
|
||||
if author.startswith('by '):
|
||||
author.replace('by ','')
|
||||
byline.extract()
|
||||
else:
|
||||
author = ''
|
||||
if h2.br is not None:
|
||||
h2.br.replaceWith(' ')
|
||||
title = self.tag_to_string(h2)
|
||||
desc = div.find(attrs={'class':['rotator-ad-body','feature-blurb-text']})
|
||||
if desc is not None:
|
||||
description = self.tag_to_string(desc)
|
||||
else:
|
||||
description = ''
|
||||
article_list.append(dict(title=title,url=url,date='',description=description,author=author,content=''))
|
||||
ul = div.find('ul','feature-blurb-links')
|
||||
if ul is not None:
|
||||
for li in ul.findAll('li'):
|
||||
a = li.a
|
||||
if a is not None:
|
||||
url = a['href']
|
||||
if not self.exclude_url(url):
|
||||
if url.startswith('/'):
|
||||
url = self.newyorker_prefix+url
|
||||
if a.br is not None:
|
||||
a.br.replaceWith(' ')
|
||||
title = '>>'+self.tag_to_string(a)
|
||||
article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
|
||||
for h3 in soup.findAll('h3','header'):
|
||||
a = h3.a
|
||||
if a is not None:
|
||||
url = a['href']
|
||||
if not self.exclude_url(url):
|
||||
if url.startswith('/'):
|
||||
url = self.newyorker_prefix+url
|
||||
byline = h3.span
|
||||
if byline is not None:
|
||||
author = self.tag_to_string(byline)
|
||||
if author.startswith('by '):
|
||||
author = author.replace('by ','')
|
||||
byline.extract()
|
||||
else:
|
||||
author = ''
|
||||
if h3.br is not None:
|
||||
h3.br.replaceWith(' ')
|
||||
title = self.tag_to_string(h3).strip()
|
||||
article_list.append(dict(title=title,url=url,date='',description='',author=author,content=''))
|
||||
return article_list
|
||||
|
||||
def load_global_section(self,securl):
|
||||
article_list = []
|
||||
try:
|
||||
soup = self.index_to_soup(securl)
|
||||
except:
|
||||
return article_list
|
||||
if '/blogs/' not in securl:
|
||||
return self.load_index_page(soup)
|
||||
for div in soup.findAll('div',attrs={'id':re.compile('^entry')}):
|
||||
h3 = div.h3
|
||||
if h3 is not None:
|
||||
a = h3.a
|
||||
if a is not None:
|
||||
url = a['href']
|
||||
if not self.exclude_url(url):
|
||||
if url.startswith('/'):
|
||||
url = self.newyorker_prefix+url
|
||||
if h3.br is not None:
|
||||
h3.br.replaceWith(' ')
|
||||
title = self.tag_to_string(h3)
|
||||
article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
|
||||
return article_list
|
||||
|
||||
def filter_ans(self, ans) :
|
||||
total_article_count = 0
|
||||
idx = 0
|
||||
idx_max = len(ans)-1
|
||||
while idx <= idx_max:
|
||||
if True: #self.verbose
|
||||
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
|
||||
for article in ans[idx][1]:
|
||||
total_article_count += 1
|
||||
if True: #self.verbose
|
||||
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
|
||||
article['url'].replace('http://m.newyorker.com','').encode('cp1252','replace')))
|
||||
idx = idx+1
|
||||
self.log( "Queued %d articles" % total_article_count )
|
||||
return ans
|
||||
|
||||
|
||||
def parse_index(self):
|
||||
ans = []
|
||||
try:
|
||||
soup = self.index_to_soup(self.newyorker_prefix)
|
||||
except:
|
||||
return ans
|
||||
seclist = self.load_global_nav(soup)
|
||||
ans.append(('Front Page',self.load_index_page(soup)))
|
||||
for (sectitle,securl) in seclist:
|
||||
ans.append((sectitle,self.load_global_section(securl)))
|
||||
return self.filter_ans(ans)
|
||||
|
||||
|
@ -12,6 +12,7 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
|
||||
max_articles_per_feed = 20
|
||||
#auto_cleanup = True
|
||||
language = 'en_GB'
|
||||
compress_news_images = True
|
||||
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup('http://www.nme.com/component/subscribe')
|
||||
@ -27,7 +28,7 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
|
||||
br.open_novisit(cov2)
|
||||
cover_url = str(cov2)
|
||||
except:
|
||||
cover_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'
|
||||
cover_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'
|
||||
return cover_url
|
||||
|
||||
masthead_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'
|
||||
|
@ -1,30 +1,30 @@
|
||||
# vim:fileencoding=UTF-8
|
||||
from __future__ import unicode_literals
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Eddie Lau'
|
||||
__copyright__ = '2011-2013, Eddie Lau'
|
||||
|
||||
# data source: normal, mobile
|
||||
__Source__ = 'mobile'
|
||||
# please replace the following "True" with "False". (Default: True)
|
||||
__MakePeriodical__ = True
|
||||
# Turn below to True if your device supports display of CJK titles (Default: False)
|
||||
__UseChineseTitle__ = False
|
||||
__UseChineseTitle__ = True
|
||||
# Set it to False if you want to skip images (Default: True)
|
||||
__KeepImages__ = True
|
||||
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
|
||||
__IncludeSummary__ = False
|
||||
__IncludeSummary__ = True
|
||||
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
|
||||
__IncludeThumbnails__ = True
|
||||
|
||||
|
||||
'''
|
||||
Change Log:
|
||||
2013/03/31 -- fix cover retrieval code and heading size, and remove in summary
|
||||
2011/12/29 -- first version done
|
||||
TODO:
|
||||
* use alternative source at http://m.singtao.com/index.php
|
||||
'''
|
||||
|
||||
from calibre.utils.date import now as nowf
|
||||
import os, datetime, re
|
||||
from datetime import date
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from contextlib import nested
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
@ -41,7 +41,7 @@ class STHKRecipe(BasicNewsRecipe):
|
||||
title = 'Sing Tao Daily - Hong Kong'
|
||||
description = 'Hong Kong Chinese Newspaper (http://singtao.com)'
|
||||
category = 'Chinese, News, Hong Kong'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:150%;}'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:200%;}'
|
||||
masthead_url = 'http://upload.wikimedia.org/wikipedia/en/d/dd/Singtao-usa.png'
|
||||
if __Source__ == 'normal':
|
||||
keep_only_tags = [dict(name='td', attrs={'class':['bodyhead','bodytext']})]
|
||||
@ -96,17 +96,13 @@ class STHKRecipe(BasicNewsRecipe):
|
||||
return self.get_dtlocal().strftime("%d")
|
||||
|
||||
def get_cover_url(self):
|
||||
#cover = 'http://singtao.com/media/a/a(2660).jpg' # for 2011/12/29
|
||||
base = 2660
|
||||
todaydate = date(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()))
|
||||
diff = todaydate - date(2011, 12, 29)
|
||||
base = base + int(diff.total_seconds()/(3600*24))
|
||||
cover = 'http://singtao.com/media/a/a(' + str(base) +').jpg'
|
||||
soup = self.index_to_soup('http://m.singtao.com/')
|
||||
cover = soup.find(attrs={'class':'special'}).get('src', False)
|
||||
br = BasicNewsRecipe.get_browser(self)
|
||||
try:
|
||||
br.open(cover)
|
||||
except:
|
||||
cover = 'http://singtao.com/images/stlogo.gif'
|
||||
cover = None
|
||||
return cover
|
||||
|
||||
def parse_index(self):
|
||||
@ -289,11 +285,11 @@ class STHKRecipe(BasicNewsRecipe):
|
||||
# the text may or may not be enclosed in <p></p> tag
|
||||
paras = articlebody.findAll('p')
|
||||
if not paras:
|
||||
paras = articlebody
|
||||
paras = articlebody
|
||||
textFound = False
|
||||
for p in paras:
|
||||
if not textFound:
|
||||
summary_candidate = self.tag_to_string(p).strip()
|
||||
summary_candidate = self.tag_to_string(p).strip().replace(' ', '')
|
||||
if len(summary_candidate) > 0:
|
||||
summary_candidate = summary_candidate.replace(u'(\u661f\u5cf6\u65e5\u5831\u5831\u9053)', '', 1)
|
||||
article.summary = article.text_summary = summary_candidate
|
||||
@ -489,3 +485,4 @@ class STHKRecipe(BasicNewsRecipe):
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.utils.magick import Image
|
||||
|
||||
|
@ -20,7 +20,7 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
|
||||
no_stylesheets = True
|
||||
|
||||
ignore_duplicate_articles = {'title','url'}
|
||||
|
||||
compress_news_images = True
|
||||
|
||||
extra_css = '''
|
||||
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
|
||||
|
@ -36,47 +36,21 @@ class TheOnion(BasicNewsRecipe):
|
||||
, 'publisher': publisher
|
||||
, 'language' : language
|
||||
}
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='h2', attrs={'class':['section_title','title']})
|
||||
,dict(attrs={'class':['main_image','meta','article_photo_lead','article_body']})
|
||||
,dict(attrs={'id':['entries']})
|
||||
]
|
||||
remove_attributes=['lang','rel']
|
||||
remove_tags_after = dict(attrs={'class':['article_body','feature_content']})
|
||||
keep_only_tags = [dict(name='article', attrs={'class':'full-article'})]
|
||||
remove_tags = [
|
||||
dict(name=['object','link','iframe','base','meta'])
|
||||
,dict(name='div', attrs={'class':['toolbar_side','graphical_feature','toolbar_bottom']})
|
||||
,dict(name='div', attrs={'id':['recent_slider','sidebar','pagination','related_media']})
|
||||
]
|
||||
|
||||
dict(name=['nav', 'aside', 'section', 'meta']),
|
||||
{'attrs':{'class':lambda x: x and ('share-tools' in x or 'ad-zone' in x)}},
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'Daily' , u'http://feeds.theonion.com/theonion/daily' )
|
||||
,(u'Sports' , u'http://feeds.theonion.com/theonion/sports' )
|
||||
]
|
||||
|
||||
def get_article_url(self, article):
|
||||
artl = BasicNewsRecipe.get_article_url(self, article)
|
||||
if artl.startswith('http://www.theonion.com/audio/'):
|
||||
artl = None
|
||||
return artl
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
for item in soup.findAll('a'):
|
||||
limg = item.find('img')
|
||||
if item.string is not None:
|
||||
str = item.string
|
||||
item.replaceWith(str)
|
||||
else:
|
||||
if limg:
|
||||
item.name = 'div'
|
||||
item.attrs = []
|
||||
if not limg.has_key('alt'):
|
||||
limg['alt'] = 'image'
|
||||
else:
|
||||
str = self.tag_to_string(item)
|
||||
item.replaceWith(str)
|
||||
def preprocess_html(self, soup, *args):
|
||||
for img in soup.findAll('img', attrs={'data-src':True}):
|
||||
if img['data-src']:
|
||||
img['src'] = img['data-src']
|
||||
return soup
|
||||
|
||||
|
||||
|
recipes/universe_today.recipe (new file, 17 lines)
@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe

class UniverseToday(BasicNewsRecipe):
    title = u'Universe Today'
    language = 'en'
    description = u'Space and astronomy news.'
    __author__ = 'seird'
    publisher = u'universetoday.com'
    category = 'science, astronomy, news, rss'
    oldest_article = 7
    max_articles_per_feed = 40
    auto_cleanup = True
    no_stylesheets = True
    use_embedded_content = False
    remove_empty_feeds = True

    feeds = [(u'Universe Today', u'http://feeds.feedburner.com/universetoday/pYdq')]
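A self-contained recipe like the one above can usually be test-built from the command line with calibre's ebook-convert tool (the ".epub" output shorthand and local filename are assumptions, not part of this commit):

    ebook-convert universe_today.recipe .epub --test

The --test switch fetches only a couple of articles per feed, which is enough to verify the feed URL and the auto_cleanup setting.
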
@ -6,17 +6,62 @@ __license__ = 'GPL v3'
|
||||
www.canada.com
|
||||
'''
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup
|
||||
|
||||
|
||||
class TimesColonist(BasicNewsRecipe):
|
||||
|
||||
# Customization -- remove sections you don't want.
|
||||
# If your e-reader is an e-ink Kindle and your output profile is
|
||||
# set properly this recipe will not include images because the
|
||||
# resulting file is too large. If you have one of these and want
|
||||
# images you can set kindle_omit_images = False
|
||||
# and remove sections (typically the e-ink Kindles will
|
||||
# work with about a dozen of these, but your mileage may vary).
|
||||
|
||||
kindle_omit_images = True
|
||||
|
||||
section_list = [
|
||||
('','Web Front Page'),
|
||||
('news/','News Headlines'),
|
||||
('news/b-c/','BC News'),
|
||||
('news/national/','National News'),
|
||||
('news/world/','World News'),
|
||||
('opinion/','Opinion'),
|
||||
('opinion/letters/','Letters'),
|
||||
('business/','Business'),
|
||||
('business/money/','Money'),
|
||||
('business/technology/','Technology'),
|
||||
('business/working/','Working'),
|
||||
('sports/','Sports'),
|
||||
('sports/hockey/','Hockey'),
|
||||
('sports/football/','Football'),
|
||||
('sports/basketball/','Basketball'),
|
||||
('sports/golf/','Golf'),
|
||||
('entertainment/','entertainment'),
|
||||
('entertainment/go/','Go!'),
|
||||
('entertainment/music/','Music'),
|
||||
('entertainment/books/','Books'),
|
||||
('entertainment/Movies/','Movies'),
|
||||
('entertainment/television/','Television'),
|
||||
('life/','Life'),
|
||||
('life/health/','Health'),
|
||||
('life/travel/','Travel'),
|
||||
('life/driving/','Driving'),
|
||||
('life/homes/','Homes'),
|
||||
('life/food-drink/','Food & Drink')
|
||||
]
|
||||
|
||||
title = u'Victoria Times Colonist'
|
||||
url_prefix = 'http://www.timescolonist.com'
|
||||
description = u'News from Victoria, BC'
|
||||
fp_tag = 'CAN_TC'
|
||||
|
||||
masthead_url = 'http://www.timescolonist.com/gmg/img/global/logoTimesColonist.png'
|
||||
|
||||
|
||||
url_list = []
|
||||
language = 'en_CA'
|
||||
__author__ = 'Nick Redding'
|
||||
@ -29,15 +74,21 @@ class TimesColonist(BasicNewsRecipe):
|
||||
.caption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
||||
'''
|
||||
keep_only_tags = [dict(name='div', attrs={'class':re.compile('main.content')})]
|
||||
remove_tags = [{'class':'comments'},
|
||||
{'id':'photocredit'},
|
||||
dict(name='div', attrs={'class':re.compile('top.controls')}),
|
||||
dict(name='div', attrs={'class':re.compile('social')}),
|
||||
dict(name='div', attrs={'class':re.compile('tools')}),
|
||||
dict(name='div', attrs={'class':re.compile('bottom.tools')}),
|
||||
dict(name='div', attrs={'class':re.compile('window')}),
|
||||
dict(name='div', attrs={'class':re.compile('related.news.element')})]
|
||||
|
||||
def __init__(self, options, log, progress_reporter):
|
||||
self.remove_tags = [{'class':'comments'},
|
||||
{'id':'photocredit'},
|
||||
dict(name='div', attrs={'class':re.compile('top.controls')}),
|
||||
dict(name='div', attrs={'class':re.compile('^comments')}),
|
||||
dict(name='div', attrs={'class':re.compile('social')}),
|
||||
dict(name='div', attrs={'class':re.compile('tools')}),
|
||||
dict(name='div', attrs={'class':re.compile('bottom.tools')}),
|
||||
dict(name='div', attrs={'class':re.compile('window')}),
|
||||
dict(name='div', attrs={'class':re.compile('related.news.element')})]
|
||||
print("PROFILE NAME = "+options.output_profile.short_name)
|
||||
if self.kindle_omit_images and options.output_profile.short_name in ['kindle', 'kindle_dx', 'kindle_pw']:
|
||||
self.remove_tags.append(dict(name='div', attrs={'class':re.compile('image-container')}))
|
||||
BasicNewsRecipe.__init__(self, options, log, progress_reporter)
|
||||
|
||||
def get_cover_url(self):
|
||||
from datetime import timedelta, date
|
||||
@ -122,7 +173,6 @@ class TimesColonist(BasicNewsRecipe):
|
||||
def preprocess_html(self,soup):
|
||||
byline = soup.find('p',attrs={'class':re.compile('ancillary')})
|
||||
if byline is not None:
|
||||
byline.find('a')
|
||||
authstr = self.tag_to_string(byline,False)
|
||||
authstr = re.sub('/ *Times Colonist','/',authstr, flags=re.IGNORECASE)
|
||||
authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE)
|
||||
@ -149,9 +199,10 @@ class TimesColonist(BasicNewsRecipe):
|
||||
atag = htag.a
|
||||
if atag is not None:
|
||||
url = atag['href']
|
||||
#print("Checking "+url)
|
||||
if atag['href'].startswith('/'):
|
||||
url = self.url_prefix+atag['href']
|
||||
url = url.strip()
|
||||
# print("Checking >>"+url+'<<\n\r')
|
||||
if url.startswith('/'):
|
||||
url = self.url_prefix+url
|
||||
if url in self.url_list:
|
||||
return
|
||||
self.url_list.append(url)
|
||||
@ -171,10 +222,10 @@ class TimesColonist(BasicNewsRecipe):
|
||||
if dtag is not None:
|
||||
description = self.tag_to_string(dtag,False)
|
||||
article_list.append(dict(title=title,url=url,date='',description=description,author='',content=''))
|
||||
#print(sectitle+title+": description = "+description+" URL="+url)
|
||||
print(sectitle+title+": description = "+description+" URL="+url+'\n\r')
|
||||
|
||||
def add_section_index(self,ans,securl,sectitle):
|
||||
print("Add section url="+self.url_prefix+'/'+securl)
|
||||
print("Add section url="+self.url_prefix+'/'+securl+'\n\r')
|
||||
try:
|
||||
soup = self.index_to_soup(self.url_prefix+'/'+securl)
|
||||
except:
|
||||
@ -193,33 +244,7 @@ class TimesColonist(BasicNewsRecipe):
|
||||
|
||||
def parse_index(self):
|
||||
ans = []
|
||||
ans = self.add_section_index(ans,'','Web Front Page')
|
||||
ans = self.add_section_index(ans,'news/','News Headlines')
|
||||
ans = self.add_section_index(ans,'news/b-c/','BC News')
|
||||
ans = self.add_section_index(ans,'news/national/','Natioanl News')
|
||||
ans = self.add_section_index(ans,'news/world/','World News')
|
||||
ans = self.add_section_index(ans,'opinion/','Opinion')
|
||||
ans = self.add_section_index(ans,'opinion/letters/','Letters')
|
||||
ans = self.add_section_index(ans,'business/','Business')
|
||||
ans = self.add_section_index(ans,'business/money/','Money')
|
||||
ans = self.add_section_index(ans,'business/technology/','Technology')
|
||||
ans = self.add_section_index(ans,'business/working/','Working')
|
||||
ans = self.add_section_index(ans,'sports/','Sports')
|
||||
ans = self.add_section_index(ans,'sports/hockey/','Hockey')
|
||||
ans = self.add_section_index(ans,'sports/football/','Football')
|
||||
ans = self.add_section_index(ans,'sports/basketball/','Basketball')
|
||||
ans = self.add_section_index(ans,'sports/golf/','Golf')
|
||||
ans = self.add_section_index(ans,'entertainment/','entertainment')
|
||||
ans = self.add_section_index(ans,'entertainment/go/','Go!')
|
||||
ans = self.add_section_index(ans,'entertainment/music/','Music')
|
||||
ans = self.add_section_index(ans,'entertainment/books/','Books')
|
||||
ans = self.add_section_index(ans,'entertainment/Movies/','movies')
|
||||
ans = self.add_section_index(ans,'entertainment/television/','Television')
|
||||
ans = self.add_section_index(ans,'life/','Life')
|
||||
ans = self.add_section_index(ans,'life/health/','Health')
|
||||
ans = self.add_section_index(ans,'life/travel/','Travel')
|
||||
ans = self.add_section_index(ans,'life/driving/','Driving')
|
||||
ans = self.add_section_index(ans,'life/homes/','Homes')
|
||||
ans = self.add_section_index(ans,'life/food-drink/','Food & Drink')
|
||||
for (url,title) in self.section_list:
|
||||
ans = self.add_section_index(ans,url,title)
|
||||
return ans
|
||||
|
||||
|
@ -1,6 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
__license__ = 'GPL v3'
|
||||
import re
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
@ -51,8 +50,8 @@ class WysokieObcasyRecipe(BasicNewsRecipe):
|
||||
printVerString=articleURL1 + ',' + articleURL2
|
||||
s= baseURL + subPath + printVerString + '.html'
|
||||
return s
|
||||
|
||||
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup('http://www.wysokieobcasy.pl/wysokie-obcasy/0,0.html')
|
||||
self.cover_url = soup.find(attrs={'class':'holder_cr'}).find('img')['src']
|
||||
return getattr(self, 'cover_url', self.cover_url)
|
||||
return getattr(self, 'cover_url', self.cover_url)
|
||||
|
@ -357,7 +357,7 @@
|
||||
<xsl:apply-templates/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="rtf:table">
|
||||
<xsl:template match="rtf:table">
|
||||
<xsl:element name="table">
|
||||
<xsl:attribute name="id">
|
||||
<xsl:value-of select="generate-id(.)"/>
|
||||
@ -390,7 +390,6 @@
|
||||
|
||||
|
||||
<xsl:output method = "xml"/>
|
||||
|
||||
<xsl:key name="style-types" match="rtf:paragraph-definition" use="@style-number"/>
|
||||
|
||||
|
||||
@ -415,13 +414,11 @@
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="rtf:page-break">
|
||||
<xsl:element name="br">
|
||||
<xsl:attribute name="style">page-break-after:always</xsl:attribute>
|
||||
</xsl:element>
|
||||
<br style = "page-break-after:always"/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="rtf:hardline-break">
|
||||
<xsl:element name="br"/>
|
||||
<br/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="rtf:rtf-definition|rtf:font-table|rtf:color-table|rtf:style-table|rtf:page-definition|rtf:list-table|rtf:override-table|rtf:override-list|rtf:list-text"/>
|
||||
@ -445,7 +442,7 @@
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match = "rtf:field-block">
|
||||
<xsl:apply-templates/>
|
||||
<xsl:apply-templates/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match = "rtf:field[@type='hyperlink']">
|
||||
@ -472,9 +469,7 @@
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="rtf:pict">
|
||||
<xsl:element name="img">
|
||||
<xsl:attribute name="src"><xsl:value-of select="@num" /></xsl:attribute>
|
||||
</xsl:element>
|
||||
<img src = "{@num}"/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="*">
|
||||
|
@ -47,6 +47,10 @@ binary_includes = [
|
||||
'/usr/lib/libgthread-2.0.so.0',
|
||||
'/usr/lib/libpng14.so.14',
|
||||
'/usr/lib/libexslt.so.0',
|
||||
# Ensure that libimobiledevice is compiled against openssl, not gnutls
|
||||
'/usr/lib/libimobiledevice.so.3',
|
||||
'/usr/lib/libusbmuxd.so.2',
|
||||
'/usr/lib/libplist.so.1',
|
||||
MAGICK_PREFIX+'/lib/libMagickWand.so.5',
|
||||
MAGICK_PREFIX+'/lib/libMagickCore.so.5',
|
||||
'/usr/lib/libgcrypt.so.11',
|
||||
|
@ -399,7 +399,8 @@ class Py2App(object):
|
||||
@flush
|
||||
def add_fontconfig(self):
|
||||
info('\nAdding fontconfig')
|
||||
for x in ('fontconfig.1', 'freetype.6', 'expat.1'):
|
||||
for x in ('fontconfig.1', 'freetype.6', 'expat.1',
|
||||
'plist.1', 'usbmuxd.2', 'imobiledevice.3'):
|
||||
src = os.path.join(SW, 'lib', 'lib'+x+'.dylib')
|
||||
self.install_dylib(src)
|
||||
dst = os.path.join(self.resources_dir, 'fonts')
|
||||
|
@ -12,13 +12,13 @@ msgstr ""
|
||||
"Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
|
||||
"devel@lists.alioth.debian.org>\n"
|
||||
"POT-Creation-Date: 2011-11-25 14:01+0000\n"
|
||||
"PO-Revision-Date: 2013-03-27 13:07+0000\n"
|
||||
"PO-Revision-Date: 2013-03-28 13:01+0000\n"
|
||||
"Last-Translator: Ferran Rius <frius64@hotmail.com>\n"
|
||||
"Language-Team: Catalan <linux@softcatala.org>\n"
|
||||
"MIME-Version: 1.0\n"
|
||||
"Content-Type: text/plain; charset=UTF-8\n"
|
||||
"Content-Transfer-Encoding: 8bit\n"
|
||||
"X-Launchpad-Export-Date: 2013-03-28 04:41+0000\n"
|
||||
"X-Launchpad-Export-Date: 2013-03-29 04:36+0000\n"
|
||||
"X-Generator: Launchpad (build 16546)\n"
|
||||
"Language: ca\n"
|
||||
|
||||
@ -1884,7 +1884,7 @@ msgstr "Awera"
|
||||
|
||||
#. name for aws
|
||||
msgid "Awyu; South"
|
||||
msgstr "Awyu meridional"
|
||||
msgstr "Awyu; meridional"
|
||||
|
||||
#. name for awt
|
||||
msgid "Araweté"
|
||||
@ -1892,7 +1892,7 @@ msgstr "Araweté"
|
||||
|
||||
#. name for awu
|
||||
msgid "Awyu; Central"
|
||||
msgstr "Awyu central"
|
||||
msgstr "Awyu; Central"
|
||||
|
||||
#. name for awv
|
||||
msgid "Awyu; Jair"
|
||||
@ -4052,7 +4052,7 @@ msgstr "Buginès"
|
||||
|
||||
#. name for buh
|
||||
msgid "Bunu; Younuo"
|
||||
msgstr "Bunu; Younuo"
|
||||
msgstr "Bunu; Younou"
|
||||
|
||||
#. name for bui
|
||||
msgid "Bongili"
|
||||
@ -4308,7 +4308,7 @@ msgstr "Bwa"
|
||||
|
||||
#. name for bwx
|
||||
msgid "Bunu; Bu-Nao"
|
||||
msgstr "Bunu; Bu-Nao"
|
||||
msgstr "Bunu; Bu Nao"
|
||||
|
||||
#. name for bwy
|
||||
msgid "Bwamu; Cwi"
|
||||
@ -19804,7 +19804,7 @@ msgstr "Minoà"
|
||||
|
||||
#. name for omo
|
||||
msgid "Utarmbung"
|
||||
msgstr ""
|
||||
msgstr "Utarmbung"
|
||||
|
||||
#. name for omp
|
||||
msgid "Manipuri; Old"
|
||||
@ -20344,7 +20344,7 @@ msgstr "Pear"
|
||||
|
||||
#. name for pcc
|
||||
msgid "Bouyei"
|
||||
msgstr ""
|
||||
msgstr "Buyí"
|
||||
|
||||
#. name for pcd
|
||||
msgid "Picard"
|
||||
@ -20456,11 +20456,11 @@ msgstr "Pengo"
|
||||
|
||||
#. name for peh
|
||||
msgid "Bonan"
|
||||
msgstr ""
|
||||
msgstr "Bonan"
|
||||
|
||||
#. name for pei
|
||||
msgid "Chichimeca-Jonaz"
|
||||
msgstr ""
|
||||
msgstr "Chichimec"
|
||||
|
||||
#. name for pej
|
||||
msgid "Pomo; Northern"
|
||||
@ -20484,7 +20484,7 @@ msgstr "Persa Antic"
|
||||
|
||||
#. name for pep
|
||||
msgid "Kunja"
|
||||
msgstr ""
|
||||
msgstr "Kunja"
|
||||
|
||||
#. name for peq
|
||||
msgid "Pomo; Southern"
|
||||
@ -20536,7 +20536,7 @@ msgstr "Pagi"
|
||||
|
||||
#. name for pgk
|
||||
msgid "Rerep"
|
||||
msgstr ""
|
||||
msgstr "Rerep"
|
||||
|
||||
#. name for pgl
|
||||
msgid "Irish; Primitive"
|
||||
@ -20624,7 +20624,7 @@ msgstr "Pima Baix"
|
||||
|
||||
#. name for pib
|
||||
msgid "Yine"
|
||||
msgstr ""
|
||||
msgstr "Yine"
|
||||
|
||||
#. name for pic
|
||||
msgid "Pinji"
|
||||
@ -20660,7 +20660,7 @@ msgstr "Pijao"
|
||||
|
||||
#. name for pil
|
||||
msgid "Yom"
|
||||
msgstr ""
|
||||
msgstr "Yom"
|
||||
|
||||
#. name for pim
|
||||
msgid "Powhatan"
|
||||
@ -20760,7 +20760,7 @@ msgstr "Llenguatge de signes pakistaní"
|
||||
|
||||
#. name for pkt
|
||||
msgid "Maleng"
|
||||
msgstr ""
|
||||
msgstr "Maleng"
|
||||
|
||||
#. name for pku
|
||||
msgid "Paku"
|
||||
@ -20768,7 +20768,7 @@ msgstr "Paku"
|
||||
|
||||
#. name for pla
|
||||
msgid "Miani"
|
||||
msgstr ""
|
||||
msgstr "Miani"
|
||||
|
||||
#. name for plb
|
||||
msgid "Polonombauk"
|
||||
@ -20804,7 +20804,7 @@ msgstr "Polci"
|
||||
|
||||
#. name for plk
|
||||
msgid "Shina; Kohistani"
|
||||
msgstr ""
|
||||
msgstr "Shina; Kohistani"
|
||||
|
||||
#. name for pll
|
||||
msgid "Palaung; Shwe"
|
||||
@ -20852,7 +20852,7 @@ msgstr "Palawà; Brooke"
|
||||
|
||||
#. name for ply
|
||||
msgid "Bolyu"
|
||||
msgstr ""
|
||||
msgstr "Bolyu"
|
||||
|
||||
#. name for plz
|
||||
msgid "Paluan"
|
||||
@ -20896,7 +20896,7 @@ msgstr "Algonquí Carolina"
|
||||
|
||||
#. name for pml
|
||||
msgid "Lingua Franca"
|
||||
msgstr ""
|
||||
msgstr "Aljamia"
|
||||
|
||||
#. name for pmm
|
||||
msgid "Pomo"
|
||||
@ -20924,7 +20924,7 @@ msgstr "Piemontès"
|
||||
|
||||
#. name for pmt
|
||||
msgid "Tuamotuan"
|
||||
msgstr ""
|
||||
msgstr "Tuamotu"
|
||||
|
||||
#. name for pmu
|
||||
msgid "Panjabi; Mirpur"
|
||||
@ -20972,7 +20972,7 @@ msgstr "Penrhyn"
|
||||
|
||||
#. name for pni
|
||||
msgid "Aoheng"
|
||||
msgstr ""
|
||||
msgstr "Aoheng"
|
||||
|
||||
#. name for pnm
|
||||
msgid "Punan Batu 1"
|
||||
@ -21008,7 +21008,7 @@ msgstr "Pontic"
|
||||
|
||||
#. name for pnu
|
||||
msgid "Bunu; Jiongnai"
|
||||
msgstr ""
|
||||
msgstr "Bunu; Jiongnai"
|
||||
|
||||
#. name for pnv
|
||||
msgid "Pinigura"
|
||||
@ -21100,7 +21100,7 @@ msgstr "Potavatomi"
|
||||
|
||||
#. name for pov
|
||||
msgid "Crioulo; Upper Guinea"
|
||||
msgstr ""
|
||||
msgstr "Crioll guineà"
|
||||
|
||||
#. name for pow
|
||||
msgid "Popoloca; San Felipe Otlaltepec"
|
||||
@ -21128,7 +21128,7 @@ msgstr "Paipai"
|
||||
|
||||
#. name for ppk
|
||||
msgid "Uma"
|
||||
msgstr ""
|
||||
msgstr "Uma"
|
||||
|
||||
#. name for ppl
|
||||
msgid "Pipil"
|
||||
@ -21144,7 +21144,7 @@ msgstr "Papapana"
|
||||
|
||||
#. name for ppo
|
||||
msgid "Folopa"
|
||||
msgstr ""
|
||||
msgstr "Folopa"
|
||||
|
||||
#. name for ppp
|
||||
msgid "Pelende"
|
||||
@ -21180,7 +21180,7 @@ msgstr "Malecite-Passamaquoddy"
|
||||
|
||||
#. name for prb
|
||||
msgid "Lua'"
|
||||
msgstr ""
|
||||
msgstr "Lua"
|
||||
|
||||
#. name for prc
|
||||
msgid "Parachi"
|
||||
@ -21220,7 +21220,7 @@ msgstr "Llenguatge de signes peruà"
|
||||
|
||||
#. name for prm
|
||||
msgid "Kibiri"
|
||||
msgstr ""
|
||||
msgstr "Kibiri"
|
||||
|
||||
#. name for prn
|
||||
msgid "Prasuni"
|
||||
@ -21272,7 +21272,7 @@ msgstr "Llenguatge de signes de Providencia"
|
||||
|
||||
#. name for psa
|
||||
msgid "Awyu; Asue"
|
||||
msgstr ""
|
||||
msgstr "Awyu; Asue"
|
||||
|
||||
#. name for psc
|
||||
msgid "Persian Sign Language"
|
||||
@ -21328,7 +21328,7 @@ msgstr "Llenguatge de signes portuguès"
|
||||
|
||||
#. name for pss
|
||||
msgid "Kaulong"
|
||||
msgstr ""
|
||||
msgstr "Kaulong"
|
||||
|
||||
#. name for pst
|
||||
msgid "Pashto; Central"
|
||||
@ -21376,11 +21376,11 @@ msgstr "Pìamatsina"
|
||||
|
||||
#. name for ptt
|
||||
msgid "Enrekang"
|
||||
msgstr ""
|
||||
msgstr "Enrekang"
|
||||
|
||||
#. name for ptu
|
||||
msgid "Bambam"
|
||||
msgstr ""
|
||||
msgstr "Bambam"
|
||||
|
||||
#. name for ptv
|
||||
msgid "Port Vato"
|
||||
@ -29584,7 +29584,7 @@ msgstr ""
|
||||
|
||||
#. name for yir
|
||||
msgid "Awyu; North"
|
||||
msgstr ""
|
||||
msgstr "Awyu; Septentrional"
|
||||
|
||||
#. name for yis
|
||||
msgid "Yis"
|
||||
|
@ -4,7 +4,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
__appname__ = u'calibre'
numeric_version = (0, 9, 25)
numeric_version = (0, 9, 26)
__version__ = u'.'.join(map(unicode, numeric_version))
__author__ = u"Kovid Goyal <kovid@kovidgoyal.net>"

@ -757,9 +757,10 @@ from calibre.ebooks.metadata.sources.isbndb import ISBNDB
from calibre.ebooks.metadata.sources.overdrive import OverDrive
from calibre.ebooks.metadata.sources.douban import Douban
from calibre.ebooks.metadata.sources.ozon import Ozon
# from calibre.ebooks.metadata.sources.google_images import GoogleImages
from calibre.ebooks.metadata.sources.google_images import GoogleImages
from calibre.ebooks.metadata.sources.big_book_search import BigBookSearch

plugins += [GoogleBooks, Amazon, Edelweiss, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon]
plugins += [GoogleBooks, GoogleImages, Amazon, Edelweiss, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon, BigBookSearch]

# }}}

@ -91,7 +91,7 @@ def restore_plugin_state_to_default(plugin_or_name):
    config['enabled_plugins'] = ep

default_disabled_plugins = set([
    'Overdrive', 'Douban Books', 'OZON.ru', 'Edelweiss', 'Google Images',
    'Overdrive', 'Douban Books', 'OZON.ru', 'Edelweiss', 'Google Images', 'Big Book Search',
])

def is_disabled(plugin):

@ -97,6 +97,12 @@ class TXTInput(InputFormatPlugin):
        if not ienc:
            ienc = 'utf-8'
            log.debug('No input encoding specified and could not auto detect using %s' % ienc)
        # Remove BOM from start of txt as its presence can confuse markdown
        import codecs
        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8, codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
            if txt.startswith(bom):
                txt = txt[len(bom):]
                break
        txt = txt.decode(ienc, 'replace')

        # Replace entities
@ -132,7 +132,7 @@ class Worker(Thread): # Get details {{{
|
||||
text()="Détails sur le produit" or \
|
||||
text()="Detalles del producto" or \
|
||||
text()="Detalhes do produto" or \
|
||||
text()="登録情報"]/../div[@class="content"]
|
||||
starts-with(text(), "登録情報")]/../div[@class="content"]
|
||||
'''
|
||||
# Editor: is for Spanish
|
||||
self.publisher_xpath = '''
|
||||
@ -235,6 +235,12 @@ class Worker(Thread): # Get details {{{
|
||||
msg = 'Failed to parse amazon details page: %r'%self.url
|
||||
self.log.exception(msg)
|
||||
return
|
||||
if self.domain == 'jp':
|
||||
for a in root.xpath('//a[@href]'):
|
||||
if 'black-curtain-redirect.html' in a.get('href'):
|
||||
self.url = 'http://amazon.co.jp'+a.get('href')
|
||||
self.log('Black curtain redirect found, following')
|
||||
return self.get_details()
|
||||
|
||||
errmsg = root.xpath('//*[@id="errorMessage"]')
|
||||
if errmsg:
|
||||
@ -252,8 +258,8 @@ class Worker(Thread): # Get details {{{
|
||||
self.log.exception('Error parsing asin for url: %r'%self.url)
|
||||
asin = None
|
||||
if self.testing:
|
||||
import tempfile
|
||||
with tempfile.NamedTemporaryFile(prefix=asin + '_',
|
||||
import tempfile, uuid
|
||||
with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
|
||||
suffix='.html', delete=False) as f:
|
||||
f.write(raw)
|
||||
print ('Downloaded html for', asin, 'saved in', f.name)
|
||||
@ -499,7 +505,7 @@ class Worker(Thread): # Get details {{{
|
||||
def parse_language(self, pd):
|
||||
for x in reversed(pd.xpath(self.language_xpath)):
|
||||
if x.tail:
|
||||
raw = x.tail.strip()
|
||||
raw = x.tail.strip().partition(',')[0].strip()
|
||||
ans = self.lang_map.get(raw, None)
|
||||
if ans:
|
||||
return ans
|
||||
@ -1004,6 +1010,11 @@ if __name__ == '__main__': # tests {{{
|
||||
] # }}}
|
||||
|
||||
jp_tests = [ # {{{
|
||||
( # Adult filtering test
|
||||
{'identifiers':{'isbn':'4799500066'}},
|
||||
[title_test(u'Bitch Trap'),]
|
||||
),
|
||||
|
||||
( # isbn -> title, authors
|
||||
{'identifiers':{'isbn': '9784101302720' }},
|
||||
[title_test(u'精霊の守り人',
|
||||
|
@ -31,7 +31,7 @@ msprefs.defaults['find_first_edition_date'] = False
|
||||
# Google covers are often poor quality (scans/errors) but they have high
|
||||
# resolution, so they trump covers from better sources. So make sure they
|
||||
# are only used if no other covers are found.
|
||||
msprefs.defaults['cover_priorities'] = {'Google':2, 'Google Images':2}
|
||||
msprefs.defaults['cover_priorities'] = {'Google':2, 'Google Images':2, 'Big Book Search':2}
|
||||
|
||||
def create_log(ostream=None):
|
||||
from calibre.utils.logging import ThreadSafeLog, FileStream
|
||||
@ -429,6 +429,40 @@ class Source(Plugin):
|
||||
mi.tags = list(map(fixcase, mi.tags))
|
||||
mi.isbn = check_isbn(mi.isbn)
|
||||
|
||||
def download_multiple_covers(self, title, authors, urls, get_best_cover, timeout, result_queue, abort, log, prefs_name='max_covers'):
|
||||
if not urls:
|
||||
log('No images found for, title: %r and authors: %r'%(title, authors))
|
||||
return
|
||||
from threading import Thread
|
||||
import time
|
||||
if prefs_name:
|
||||
urls = urls[:self.prefs[prefs_name]]
|
||||
if get_best_cover:
|
||||
urls = urls[:1]
|
||||
log('Downloading %d covers'%len(urls))
|
||||
workers = [Thread(target=self.download_image, args=(u, timeout, log, result_queue)) for u in urls]
|
||||
for w in workers:
|
||||
w.daemon = True
|
||||
w.start()
|
||||
alive = True
|
||||
start_time = time.time()
|
||||
while alive and not abort.is_set() and time.time() - start_time < timeout:
|
||||
alive = False
|
||||
for w in workers:
|
||||
if w.is_alive():
|
||||
alive = True
|
||||
break
|
||||
abort.wait(0.1)
|
||||
|
||||
def download_image(self, url, timeout, log, result_queue):
|
||||
try:
|
||||
ans = self.browser.open_novisit(url, timeout=timeout).read()
|
||||
result_queue.put((self, ans))
|
||||
log('Downloaded cover from: %s'%url)
|
||||
except Exception:
|
||||
self.log.exception('Failed to download cover from: %r'%url)
|
||||
|
||||
|
||||
# }}}
|
||||
|
||||
# Metadata API {{{
|
||||
|
src/calibre/ebooks/metadata/sources/big_book_search.py (new file, 58 lines)
@ -0,0 +1,58 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from calibre.ebooks.metadata.sources.base import Source, Option

def get_urls(br, tokens):
    from urllib import quote_plus
    from mechanize import Request
    from lxml import html
    escaped = [quote_plus(x.encode('utf-8')) for x in tokens if x and x.strip()]
    q = b'+'.join(escaped)
    url = 'http://bigbooksearch.com/books/'+q
    br.open(url).read()
    req = Request('http://bigbooksearch.com/query.php?SearchIndex=books&Keywords=%s&ItemPage=1'%q)
    req.add_header('X-Requested-With', 'XMLHttpRequest')
    req.add_header('Referer', url)
    raw = br.open(req).read()
    root = html.fromstring(raw.decode('utf-8'))
    urls = [i.get('src') for i in root.xpath('//img[@src]')]
    return urls

class BigBookSearch(Source):

    name = 'Big Book Search'
    description = _('Downloads multiple book covers from Amazon. Useful to find alternate covers.')
    capabilities = frozenset(['cover'])
    config_help_message = _('Configure the Big Book Search plugin')
    can_get_multiple_covers = True
    options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),
            _('The maximum number of covers to process from the search result')),
        )
    supports_gzip_transfer_encoding = True

    def download_cover(self, log, result_queue, abort,
            title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
        if not title:
            return
        br = self.browser
        tokens = tuple(self.get_title_tokens(title)) + tuple(self.get_author_tokens(authors))
        urls = get_urls(br, tokens)
        self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)

def test():
    from calibre import browser
    import pprint
    br = browser()
    urls = get_urls(br, ['consider', 'phlebas', 'banks'])
    pprint.pprint(urls)

if __name__ == '__main__':
    test()
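Because the module ends with a test() hook, the scraper can be exercised outside the GUI; one way, assuming a source checkout and a working calibre install, is the debug runner:

    calibre-debug -e src/calibre/ebooks/metadata/sources/big_book_search.py

This simply prints the image URLs that get_urls() returns for the sample query ('consider', 'phlebas', 'banks').
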
@ -18,12 +18,13 @@ from calibre.utils.magick.draw import Image, save_cover_data_to
|
||||
|
||||
class Worker(Thread):
|
||||
|
||||
def __init__(self, plugin, abort, title, authors, identifiers, timeout, rq):
|
||||
def __init__(self, plugin, abort, title, authors, identifiers, timeout, rq, get_best_cover=False):
|
||||
Thread.__init__(self)
|
||||
self.daemon = True
|
||||
|
||||
self.plugin = plugin
|
||||
self.abort = abort
|
||||
self.get_best_cover = get_best_cover
|
||||
self.buf = BytesIO()
|
||||
self.log = create_log(self.buf)
|
||||
self.title, self.authors, self.identifiers = (title, authors,
|
||||
@ -37,7 +38,7 @@ class Worker(Thread):
|
||||
try:
|
||||
if self.plugin.can_get_multiple_covers:
|
||||
self.plugin.download_cover(self.log, self.rq, self.abort,
|
||||
title=self.title, authors=self.authors, get_best_cover=True,
|
||||
title=self.title, authors=self.authors, get_best_cover=self.get_best_cover,
|
||||
identifiers=self.identifiers, timeout=self.timeout)
|
||||
else:
|
||||
self.plugin.download_cover(self.log, self.rq, self.abort,
|
||||
@ -72,7 +73,7 @@ def process_result(log, result):
|
||||
return (plugin, width, height, fmt, data)
|
||||
|
||||
def run_download(log, results, abort,
|
||||
title=None, authors=None, identifiers={}, timeout=30):
|
||||
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
|
||||
'''
|
||||
Run the cover download, putting results into the queue :param:`results`.
|
||||
|
||||
@ -89,7 +90,7 @@ def run_download(log, results, abort,
|
||||
plugins = [p for p in metadata_plugins(['cover']) if p.is_configured()]
|
||||
|
||||
rq = Queue()
|
||||
workers = [Worker(p, abort, title, authors, identifiers, timeout, rq) for p
|
||||
workers = [Worker(p, abort, title, authors, identifiers, timeout, rq, get_best_cover=get_best_cover) for p
|
||||
in plugins]
|
||||
for w in workers:
|
||||
w.start()
|
||||
@ -163,7 +164,7 @@ def download_cover(log,
|
||||
abort = Event()
|
||||
|
||||
run_download(log, rq, abort, title=title, authors=authors,
|
||||
identifiers=identifiers, timeout=timeout)
|
||||
identifiers=identifiers, timeout=timeout, get_best_cover=True)
|
||||
|
||||
results = []
|
||||
|
||||
|
@ -106,6 +106,8 @@ class Worker(Thread): # {{{
|
||||
parts = pub.partition(':')[0::2]
|
||||
pub = parts[1] or parts[0]
|
||||
try:
|
||||
if ', Ship Date:' in pub:
|
||||
pub = pub.partition(', Ship Date:')[0]
|
||||
q = parse_only_date(pub, assume_utc=True)
|
||||
if q.year != UNDEFINED_DATE:
|
||||
mi.pubdate = q
|
||||
|
@ -39,39 +39,11 @@ class GoogleImages(Source):
|
||||
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
|
||||
if not title:
|
||||
return
|
||||
from threading import Thread
|
||||
import time
|
||||
timeout = max(60, timeout) # Needs at least a minute
|
||||
title = ' '.join(self.get_title_tokens(title))
|
||||
author = ' '.join(self.get_author_tokens(authors))
|
||||
urls = self.get_image_urls(title, author, log, abort, timeout)
|
||||
if not urls:
|
||||
log('No images found in Google for, title: %r and authors: %r'%(title, author))
|
||||
return
|
||||
urls = urls[:self.prefs['max_covers']]
|
||||
if get_best_cover:
|
||||
urls = urls[:1]
|
||||
workers = [Thread(target=self.download_image, args=(url, timeout, log, result_queue)) for url in urls]
|
||||
for w in workers:
|
||||
w.daemon = True
|
||||
w.start()
|
||||
alive = True
|
||||
start_time = time.time()
|
||||
while alive and not abort.is_set() and time.time() - start_time < timeout:
|
||||
alive = False
|
||||
for w in workers:
|
||||
if w.is_alive():
|
||||
alive = True
|
||||
break
|
||||
abort.wait(0.1)
|
||||
|
||||
def download_image(self, url, timeout, log, result_queue):
|
||||
try:
|
||||
ans = self.browser.open_novisit(url, timeout=timeout).read()
|
||||
result_queue.put((self, ans))
|
||||
log('Downloaded cover from: %s'%url)
|
||||
except Exception:
|
||||
self.log.exception('Failed to download cover from: %r'%url)
|
||||
self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)
|
||||
|
||||
def get_image_urls(self, title, author, log, abort, timeout):
|
||||
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
|
||||
|
@ -262,6 +262,35 @@ def from_links(container):
|
||||
toc.remove(child)
|
||||
return toc
|
||||
|
||||
def find_text(node):
|
||||
LIMIT = 200
|
||||
pat = re.compile(r'\s+')
|
||||
for child in node:
|
||||
if isinstance(child, etree._Element):
|
||||
text = xml2text(child).strip()
|
||||
text = pat.sub(' ', text)
|
||||
if len(text) < 1:
|
||||
continue
|
||||
if len(text) > LIMIT:
|
||||
# Look for less text in a child of this node, recursively
|
||||
ntext = find_text(child)
|
||||
return ntext or (text[:LIMIT] + '...')
|
||||
else:
|
||||
return text
|
||||
|
||||
def from_files(container):
|
||||
toc = TOC()
|
||||
for spinepath in container.spine_items:
|
||||
name = container.abspath_to_name(spinepath)
|
||||
root = container.parsed(name)
|
||||
body = XPath('//h:body')(root)
|
||||
if not body:
|
||||
continue
|
||||
text = find_text(body[0])
|
||||
if text:
|
||||
toc.add(text, name)
|
||||
return toc
|
||||
|
||||
def add_id(container, name, loc):
|
||||
root = container.parsed(name)
|
||||
body = root.xpath('//*[local-name()="body"]')[0]
|
||||
|
@ -333,8 +333,8 @@ class OEBReader(object):
|
||||
guide = self.oeb.guide
|
||||
manifest = self.oeb.manifest
|
||||
for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):
|
||||
href = elem.get('href')
|
||||
path = urlnormalize(urldefrag(href)[0])
|
||||
ref_href = elem.get('href')
|
||||
path = urlnormalize(urldefrag(ref_href)[0])
|
||||
if path not in manifest.hrefs:
|
||||
corrected_href = None
|
||||
for href in manifest.hrefs:
|
||||
@ -342,12 +342,12 @@ class OEBReader(object):
|
||||
corrected_href = href
|
||||
break
|
||||
if corrected_href is None:
|
||||
self.logger.warn(u'Guide reference %r not found' % href)
|
||||
self.logger.warn(u'Guide reference %r not found' % ref_href)
|
||||
continue
|
||||
href = corrected_href
|
||||
ref_href = corrected_href
|
||||
typ = elem.get('type')
|
||||
if typ not in guide:
|
||||
guide.add(typ, elem.get('title'), href)
|
||||
guide.add(typ, elem.get('title'), ref_href)
|
||||
|
||||
def _find_ncx(self, opf):
|
||||
result = xpath(opf, '/o2:package/o2:spine/@toc')
|
||||
|
@ -180,5 +180,6 @@ class BorderParse:
        elif 'single' in border_style_list:
            new_border_dict[att] = 'single'
        else:
            new_border_dict[att] = border_style_list[0]
            if border_style_list:
                new_border_dict[att] = border_style_list[0]
        return new_border_dict
@ -88,9 +88,7 @@ class StoreAction(InterfaceAction):
|
||||
if row == None:
|
||||
error_dialog(self.gui, _('Cannot search'), _('No book selected'), show=True)
|
||||
return
|
||||
|
||||
query = 'author:"%s"' % self._get_author(row)
|
||||
self.search(query)
|
||||
self.search({ 'author': self._get_author(row) })
|
||||
|
||||
def _get_title(self, row):
|
||||
title = ''
|
||||
@ -107,18 +105,14 @@ class StoreAction(InterfaceAction):
|
||||
if row == None:
|
||||
error_dialog(self.gui, _('Cannot search'), _('No book selected'), show=True)
|
||||
return
|
||||
|
||||
query = 'title:"%s"' % self._get_title(row)
|
||||
self.search(query)
|
||||
self.search({ 'title': self._get_title(row) })
|
||||
|
||||
def search_author_title(self):
|
||||
row = self._get_selected_row()
|
||||
if row == None:
|
||||
error_dialog(self.gui, _('Cannot search'), _('No book selected'), show=True)
|
||||
return
|
||||
|
||||
query = 'author:"%s" title:"%s"' % (self._get_author(row), self._get_title(row))
|
||||
self.search(query)
|
||||
self.search({ 'author': self._get_author(row), 'title': self._get_title(row) })
|
||||
|
||||
def choose(self):
|
||||
from calibre.gui2.store.config.chooser.chooser_dialog import StoreChooserDialog
|
||||
|
@ -62,16 +62,20 @@ class SearchDialog(QDialog, Ui_Dialog):
|
||||
self.setup_store_checks()
|
||||
|
||||
# Set the search query
|
||||
if isinstance(query, (str, unicode)):
|
||||
self.search_edit.setText(query)
|
||||
elif isinstance(query, dict):
|
||||
if 'author' in query:
|
||||
self.search_author.setText(query['author'])
|
||||
if 'title' in query:
|
||||
self.search_title.setText(query['title'])
|
||||
# Title
|
||||
self.search_title.setText(query)
|
||||
self.search_title.setSizeAdjustPolicy(QComboBox.AdjustToMinimumContentsLengthWithIcon)
|
||||
self.search_title.setMinimumContentsLength(25)
|
||||
# Author
|
||||
self.search_author.setText(query)
|
||||
self.search_author.setSizeAdjustPolicy(QComboBox.AdjustToMinimumContentsLengthWithIcon)
|
||||
self.search_author.setMinimumContentsLength(25)
|
||||
# Keyword
|
||||
self.search_edit.setText(query)
|
||||
self.search_edit.setSizeAdjustPolicy(QComboBox.AdjustToMinimumContentsLengthWithIcon)
|
||||
self.search_edit.setMinimumContentsLength(25)
|
||||
|
||||
@ -408,7 +412,7 @@ class SearchDialog(QDialog, Ui_Dialog):
|
||||
self.save_state()
|
||||
|
||||
def exec_(self):
|
||||
if unicode(self.search_edit.text()).strip():
|
||||
if unicode(self.search_edit.text()).strip() or unicode(self.search_title.text()).strip() or unicode(self.search_author.text()).strip():
|
||||
self.do_search()
|
||||
return QDialog.exec_(self)
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import (unicode_literals, division, absolute_import, print_function)
|
||||
store_version = 2 # Needed for dynamic plugin loading
|
||||
store_version = 3 # Needed for dynamic plugin loading
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011-2013, Tomasz Długosz <tomek3d@gmail.com>'
|
||||
@ -67,7 +67,7 @@ class NextoStore(BasicStoreConfig, StorePlugin):
|
||||
|
||||
cover_url = ''.join(data.xpath('.//img[@class="cover"]/@src'))
|
||||
cover_url = re.sub(r'%2F', '/', cover_url)
|
||||
cover_url = re.sub(r'\widthMax=120&heightMax=200', 'widthMax=64&heightMax=64', cover_url)
|
||||
cover_url = re.sub(r'widthMax=120&heightMax=200', 'widthMax=64&heightMax=64', cover_url)
|
||||
title = ''.join(data.xpath('.//a[@class="title"]/text()'))
|
||||
title = re.sub(r' - ebook$', '', title)
|
||||
formats = ', '.join(data.xpath('.//ul[@class="formats_available"]/li//b/text()'))
|
||||
@ -82,7 +82,7 @@ class NextoStore(BasicStoreConfig, StorePlugin):
|
||||
counter -= 1
|
||||
|
||||
s = SearchResult()
|
||||
s.cover_url = 'http://www.nexto.pl' + cover_url
|
||||
s.cover_url = cover_url if cover_url[:4] == 'http' else 'http://www.nexto.pl' + cover_url
|
||||
s.title = title.strip()
|
||||
s.author = author.strip()
|
||||
s.price = price
|
||||
|
@ -1,7 +1,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import (unicode_literals, division, absolute_import, print_function)
|
||||
store_version = 2 # Needed for dynamic plugin loading
|
||||
store_version = 3 # Needed for dynamic plugin loading
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011-2013, Tomasz Długosz <tomek3d@gmail.com>'
|
||||
@ -41,7 +41,7 @@ class VirtualoStore(BasicStoreConfig, StorePlugin):
|
||||
url = 'http://virtualo.pl/?q=' + urllib.quote(query) + '&f=format_id:4,6,3'
|
||||
|
||||
br = browser()
|
||||
no_drm_pattern = re.compile("Znak wodny")
|
||||
no_drm_pattern = re.compile(r'Znak wodny|Brak')
|
||||
|
||||
counter = max_results
|
||||
with closing(br.open(url, timeout=timeout)) as f:
|
||||
@ -58,8 +58,8 @@ class VirtualoStore(BasicStoreConfig, StorePlugin):
|
||||
cover_url = ''.join(data.xpath('.//div[@class="list_middle_left"]//a//img/@src'))
|
||||
title = ''.join(data.xpath('.//div[@class="list_title list_text_left"]/a/text()'))
|
||||
author = ', '.join(data.xpath('.//div[@class="list_authors list_text_left"]/a/text()'))
|
||||
formats = [ form.split('_')[-1].replace('.png', '') for form in data.xpath('.//div[@style="width:55%;float:left;text-align:left;height:18px;"]//a/img/@src')]
|
||||
nodrm = no_drm_pattern.search(''.join(data.xpath('.//div[@style="width:45%;float:right;text-align:right;height:18px;"]/div/div/text()')))
|
||||
formats = [ form.split('_')[-1].replace('.png', '') for form in data.xpath('.//div[@style="width:55%;float:left;text-align:left;height:18px;"]//a/span/img/@src')]
|
||||
nodrm = no_drm_pattern.search(''.join(data.xpath('.//div[@style="width:45%;float:right;text-align:right;height:18px;"]//span[@class="prompt_preview"]/text()')))
|
||||
|
||||
counter -= 1
|
||||
|
||||
@ -70,6 +70,6 @@ class VirtualoStore(BasicStoreConfig, StorePlugin):
|
||||
s.price = price + ' zł'
|
||||
s.detail_item = 'http://virtualo.pl' + id.strip().split('http://')[0]
|
||||
s.formats = ', '.join(formats).upper()
|
||||
s.drm = SearchResult.DRM_UNLOCKED if nodrm else SearchResult.DRM_UNKNOWN
|
||||
s.drm = SearchResult.DRM_UNLOCKED if nodrm else SearchResult.DRM_LOCKED
|
||||
|
||||
yield s
|
||||
|
@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-

from __future__ import (unicode_literals, division, absolute_import, print_function)
store_version = 1 # Needed for dynamic plugin loading
store_version = 2 # Needed for dynamic plugin loading

__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
@ -24,8 +24,8 @@ from calibre.gui2.store.web_store_dialog import WebStoreDialog
class WaterstonesUKStore(BasicStoreConfig, StorePlugin):

    def open(self, parent=None, detail_item=None, external=False):
        url = 'http://clkuk.tradedoubler.com/click?p=51196&a=1951604&g=19333484'
        url_details = 'http://clkuk.tradedoubler.com/click?p(51196)a(1951604)g(16460516)url({0})'
        url = 'http://www.awin1.com/awclick.php?mid=3787&id=120917'
        url_details = 'http://www.awin1.com/cread.php?awinmid=3787&awinaffid=120917&clickref=&p={0}'

        if external or self.config.get('open_external', False):
            if detail_item:
@ -18,7 +18,7 @@ from PyQt4.Qt import (QPushButton, QFrame, QVariant, QMenu, QInputDialog,
|
||||
|
||||
from calibre.ebooks.oeb.polish.container import get_container, AZW3Container
|
||||
from calibre.ebooks.oeb.polish.toc import (
|
||||
get_toc, add_id, TOC, commit_toc, from_xpaths, from_links)
|
||||
get_toc, add_id, TOC, commit_toc, from_xpaths, from_links, from_files)
|
||||
from calibre.gui2 import Application, error_dialog, gprefs
|
||||
from calibre.gui2.progress_indicator import ProgressIndicator
|
||||
from calibre.gui2.toc.location import ItemEdit
|
||||
@ -126,6 +126,7 @@ class ItemView(QFrame): # {{{
|
||||
go_to_root = pyqtSignal()
|
||||
create_from_xpath = pyqtSignal(object)
|
||||
create_from_links = pyqtSignal()
|
||||
create_from_files = pyqtSignal()
|
||||
flatten_toc = pyqtSignal()
|
||||
|
||||
def __init__(self, parent):
|
||||
@ -183,6 +184,15 @@ class ItemView(QFrame): # {{{
|
||||
)))
|
||||
l.addWidget(b)
|
||||
|
||||
self.cfb = b = QPushButton(_('Generate ToC from &files'))
|
||||
b.clicked.connect(self.create_from_files)
|
||||
b.setToolTip(textwrap.fill(_(
|
||||
'Generate a Table of Contents from individual files in the book.'
|
||||
' Each entry in the ToC will point to the start of the file, the'
|
||||
' text of the entry will be the "first line" of text from the file.'
|
||||
)))
|
||||
l.addWidget(b)
|
||||
|
||||
self.xpb = b = QPushButton(_('Generate ToC from &XPath'))
|
||||
b.clicked.connect(self.create_from_user_xpath)
|
||||
b.setToolTip(textwrap.fill(_(
|
||||
@ -549,11 +559,11 @@ class TOCView(QWidget): # {{{
|
||||
b.setToolTip(_('Remove all selected entries'))
|
||||
b.clicked.connect(self.del_items)
|
||||
|
||||
self.left_button = b = QToolButton(self)
|
||||
self.right_button = b = QToolButton(self)
|
||||
b.setIcon(QIcon(I('forward.png')))
|
||||
b.setIconSize(QSize(ICON_SIZE, ICON_SIZE))
|
||||
l.addWidget(b, 4, 3)
|
||||
b.setToolTip(_('Unindent the current entry [Ctrl+Left]'))
|
||||
b.setToolTip(_('Indent the current entry [Ctrl+Right]'))
|
||||
b.clicked.connect(self.tocw.move_right)
|
||||
|
||||
self.down_button = b = QToolButton(self)
|
||||
@ -577,6 +587,7 @@ class TOCView(QWidget): # {{{
|
||||
i.add_new_item.connect(self.add_new_item)
|
||||
i.create_from_xpath.connect(self.create_from_xpath)
|
||||
i.create_from_links.connect(self.create_from_links)
|
||||
i.create_from_files.connect(self.create_from_files)
|
||||
i.flatten_item.connect(self.flatten_item)
|
||||
i.flatten_toc.connect(self.flatten_toc)
|
||||
i.go_to_root.connect(self.go_to_root)
|
||||
@ -778,6 +789,14 @@ class TOCView(QWidget): # {{{
|
||||
_('No links were found that could be added to the Table of Contents.'), show=True)
|
||||
self.insert_toc_fragment(toc)
|
||||
|
||||
def create_from_files(self):
|
||||
toc = from_files(self.ebook)
|
||||
if len(toc) == 0:
|
||||
return error_dialog(self, _('No items found'),
|
||||
_('No files were found that could be added to the Table of Contents.'), show=True)
|
||||
self.insert_toc_fragment(toc)
|
||||
|
||||
|
||||
# }}}
|
||||
|
||||
class TOCEditor(QDialog): # {{{
|
||||
|
@ -54,7 +54,7 @@ def get_parser(usage):
def get_db(dbpath, options):
    global do_notify
    if options.library_path is not None:
        dbpath = options.library_path
        dbpath = os.path.expanduser(options.library_path)
    if dbpath is None:
        raise ValueError('No saved library path, either run the GUI or use the'
            ' --with-library option')
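In practice this means the library path given on the command line now goes through os.path.expanduser(), so an invocation such as "calibredb list --with-library ~/Books" (hypothetical path) resolves the leading ~ to the user's home directory instead of failing to locate the library.
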
Some files were not shown because too many files have changed in this diff; diffs of a number of other large files were suppressed.