commit 724ee9e7a1
Sync to trunk.
@@ -41,6 +41,8 @@ def days_in_month(year, month):
     return ans
 
 def rationalize_country(country):
+    if not country:
+        return 'Unknown'
     if re.match('(?i)(US|USA|America)', country):
         country = 'USA'
     elif re.match('(?i)(UK|Britain|england)', country):
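Note: the added guard maps an empty or missing country value to 'Unknown' before any of the regex checks run. A minimal illustration of the intended behaviour (assuming the function ultimately returns the normalized value, which is outside this hunk):

    rationalize_country(None)       # -> 'Unknown'
    rationalize_country('America')  # -> normalized to 'USA'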
@@ -951,12 +951,10 @@ class BasicNewsRecipe(Recipe):
     def error_in_article_download(self, request, traceback):
         self.jobs_done += 1
         self.log.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
-        debug = request.stream.getvalue().decode('utf-8', 'ignore')
-        self.log.debug(debug)
         self.log.debug(traceback)
         self.log.debug('\n')
         self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
-        self.failed_downloads.append((request.feed, request.article, debug))
+        self.failed_downloads.append((request.feed, request.article, traceback))
 
     def parse_feeds(self):
         '''
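Note: with this change a failed article download records the worker traceback instead of the partially downloaded stream, so each entry appended to failed_downloads is a (feed, article, traceback) tuple. A minimal sketch of reading those entries after a run (the recipe variable and the attributes used are only those already visible in the method above):

    for feed, article, tb in recipe.failed_downloads:
        # article.title and article.url identify the failed article;
        # tb is the traceback string passed to error_in_article_download
        recipe.log.debug(tb)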
@@ -6,56 +6,90 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
 time.com
 '''
 
+import re
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class Time(BasicNewsRecipe):
     title = u'Time'
     __author__ = 'Kovid Goyal'
     description = 'Weekly magazine'
-    oldest_article = 7
-    max_articles_per_feed = 100
     encoding = 'utf-8'
     no_stylesheets = True
     language = _('English')
-    use_embedded_content = False
 
-    keep_only_tags = [dict(name='div', attrs={'class':'tout1'})]
-    remove_tags_after = [dict(id='connectStory')]
+    remove_tags_before = dict(id="artHd")
+    remove_tags_after = {'class':"ltCol"}
     remove_tags = [
-        dict(name='ul', attrs={'class':['button', 'find']}),
-        dict(name='div', attrs={'class':['nav', 'header', 'sectheader',
-            'searchWrap', 'subNav',
-            'artTools', 'connect',
-            'similarrecs']}),
-        dict(name='div', id=['articleSideBar', 'connectStory']),
-        dict(name='dl', id=['links']),
+        {'class':['articleTools', 'enlarge', 'search']},
+        {'id':['quigoArticle', 'contentTools', 'articleSideBar', 'header', 'navTop']},
+        {'target':'_blank'},
         ]
+    recursions = 1
+    match_regexps = [r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}.html']
 
-    feeds = [
-        (u'Top Stories', u'http://feedproxy.google.com/time/topstories')
-        ,(u'Nation', u'http://feedproxy.google.com/time/nation')
-        ,(u'Business & Tech', u'http://feedproxy.google.com/time/business')
-        ,(u'Science & Tech', u'http://feedproxy.google.com/time/scienceandhealth')
-        ,(u'World', u'http://feedproxy.google.com/time/world')
-        ,(u'Entertainment', u'http://feedproxy.google.com/time/entertainment')
-        ,(u'Politics', u'http://feedproxy.google.com/time/politics')
-        ,(u'Travel', u'http://feedproxy.google.com/time/travel')
-    ]
 
-    def get_article_url(self, article):
-        return article.get('guid', article['link'])
-
-    def get_cover_url(self):
-        soup = self.index_to_soup('http://www.time.com/time/')
-        img = soup.find('img', alt='Current Time.com Cover', width='107')
+    def parse_index(self):
+        soup = self.index_to_soup('http://www.time.com/time/magazine')
+        img = soup.find('a', title="View Large Cover", href=True)
         if img is not None:
-            return img.get('src', None)
+            cover_url = 'http://www.time.com'+img['href']
+            try:
+                nsoup = self.index_to_soup(cover_url)
+                img = nsoup.find('img', src=re.compile('archive/covers'))
+                if img is not None:
+                    self.cover_url = img['src']
+            except:
+                self.log.exception('Failed to fetch cover')
 
-    def print_version(self, url):
-        try:
-            soup = self.index_to_soup(url)
-            print_link = soup.find('a', {'id':'prt'})
-            return 'http://www.time.com' + print_link['href']
-        except:
-            self.log_exception('Failed to find print version for '+url)
-        return ''
+        feeds = []
+        parent = soup.find(id='tocGuts')
+        for seched in parent.findAll(attrs={'class':'toc_seched'}):
+            section = self.tag_to_string(seched).capitalize()
+            articles = list(self.find_articles(seched))
+            feeds.append((section, articles))
+
+        return feeds
+
+    def find_articles(self, seched):
+        for a in seched.findNextSiblings('a', href=True, attrs={'class':'toc_hed'}):
+            yield {
+                'title'       : self.tag_to_string(a),
+                'url'         : 'http://www.time.com'+a['href'],
+                'date'        : '',
+                'description' : self.article_description(a)
+            }
+
+    def article_description(self, a):
+        ans = []
+        while True:
+            t = a.nextSibling
+            if t is None:
+                break
+            a = t
+            if getattr(t, 'name', False):
+                if t.get('class', '') == 'toc_parens' or t.name == 'br':
+                    continue
+                if t.name in ('div', 'a'):
+                    break
+                ans.append(self.tag_to_string(t))
+            else:
+                ans.append(unicode(t))
+        return u' '.join(ans).replace(u'\xa0', u'').strip()
+
+    def postprocess_html(self, soup, first_page):
+        div = soup.find(attrs={'class':'artPag'})
+        if div is not None:
+            div.extract()
+        if not first_page:
+            for cls in ('photoBkt', 'artHd'):
+                div = soup.find(attrs={'class':cls})
+                if div is not None:
+                    div.extract()
+        div = soup.find(attrs={'class':'artTxt'})
+        if div is not None:
+            p = div.find('p')
+            if p is not None:
+                p.extract()
+
+        return soup
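Note: the recipe now builds its article list from the magazine's table of contents rather than from RSS feeds, and recursions = 1 together with match_regexps lets the downloader follow the numbered follow-up pages of multi-page articles. parse_index returns a list of (section title, articles) pairs, where each article is a dict with the 'title', 'url', 'date' and 'description' keys produced by find_articles, roughly of this shape (values are placeholders, not real data):

    [
        (u'Nation', [
            {'title': u'Example article',
             'url': u'http://www.time.com/...',
             'date': '',
             'description': u''},
        ]),
    ]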