Sync to trunk.
@@ -40,6 +40,7 @@ recipes/.gitignore
recipes/README.md
recipes/icon_checker.py
recipes/readme_updater.py
recipes/garfield.recipe
recipes/katalog_egazeciarz.recipe
recipes/tv_axnscifi.recipe
recipes/tv_comedycentral.recipe
@@ -63,6 +64,7 @@ recipes/tv_tvppolonia.recipe
recipes/tv_tvpuls.recipe
recipes/tv_viasathistory.recipe
recipes/icons/katalog_egazeciarz.png
recipes/icons/garfield.png
recipes/icons/tv_axnscifi.png
recipes/icons/tv_comedycentral.png
recipes/icons/tv_discoveryscience.png
@@ -20,6 +20,58 @@
# new recipes:
#  - title:

- version: 0.9.26
  date: 2013-04-05

  new features:
    - title: "PDF Output: Allow using templates to create arbitrary headers and footers. Look under PDF Output in the conversion dialog for this feature."

    - title: "ToC Editor: Allow generating the ToC directly from individual files inside the ebook. Useful for EPUBs that have individual chapters in single files."
      tickets: [1163520]

    - title: "ToC Editor: Add buttons to indent/unindent the current entry"

    - title: "ToC Editor: Right-click menu to perform various useful actions on entries in the ToC"

    - title: "Column icons: Allow use of wide images as column icons"

    - title: "Add USB ids for the Palm Pre2 and Samsung Galaxy phone to the device drivers"
      tickets: [1162293,1163115]

  bug fixes:
    - title: "PDF Output: Fix generating page numbers causing links to not work."
      tickets: [1162573]

    - title: "Wrong filename output in error message when 'Guide reference not found'"
      tickets: [1163659]

    - title: "Get Books: Update Amazon, Barnes & Noble, Waterstones and Gutenberg store plugins for website change"

    - title: "PDF Output: Fix 1 pixel wide left and top margins on the cover page for some PDF conversions due to incorrect rounding."
      tickets: [1162054]

    - title: "ToC Editor: Fix drag and drop of multiple items resulting in the dropped items being in random order sometimes."
      tickets: [1161999]

  improved recipes:
    - Financial Times UK
    - Sing Tao Daily
    - Apple Daily
    - A List Apart
    - Business Week
    - Harpers printed edition
    - Harvard Business Review

  new recipes:
    - title: AM730
      author: Eddie Lau

    - title: Arret sur images
      author: Francois D

    - title: Diario de Noticias
      author: Jose Pinto

- version: 0.9.25
  date: 2013-03-29
@@ -1,3 +1,4 @@
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from collections import OrderedDict

@@ -39,7 +40,7 @@ class BusinessWeekMagazine(BasicNewsRecipe):
            title=self.tag_to_string(div.a).strip()
            url=div.a['href']
            soup0 = self.index_to_soup(url)
            urlprint=soup0.find('li', attrs={'class':'print tracked'}).a['href']
            urlprint=soup0.find('a', attrs={'href':re.compile('.*printer.*')})['href']
            articles.append({'title':title, 'url':urlprint, 'description':'', 'date':''})

@@ -56,7 +57,7 @@ class BusinessWeekMagazine(BasicNewsRecipe):
            title=self.tag_to_string(div.a).strip()
            url=div.a['href']
            soup0 = self.index_to_soup(url)
            urlprint=soup0.find('li', attrs={'class':'print tracked'}).a['href']
            urlprint=soup0.find('a', attrs={'href':re.compile('.*printer.*')})['href']
            articles.append({'title':title, 'url':urlprint, 'description':desc, 'date':''})

        if articles:
recipes/diario_de_noticias.recipe (new file, 23 lines)
@@ -0,0 +1,23 @@
# vim:fileencoding=UTF-8

from __future__ import unicode_literals
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1365070687(BasicNewsRecipe):
    title ='Diário de Notícias'
    oldest_article = 7
    language = 'pt'
    __author__ = 'Jose Pinto'
    max_articles_per_feed = 100
    keep_only_tags = [dict(name='div', attrs={'id':'cln-esqmid'}) ]
    remove_tags = [ dict(name='table', attrs={'class':'TabFerramentasInf'}) ]

    feeds = [(u'Portugal', u'http://feeds.dn.pt/DN-Portugal'),
             (u'Globo', u'http://feeds.dn.pt/DN-Globo'),
             (u'Economia', u'http://feeds.dn.pt/DN-Economia'),
             (u'Ci\xeancia', u'http://feeds.dn.pt/DN-Ciencia'),
             (u'Artes', u'http://feeds.dn.pt/DN-Artes'),
             (u'TV & Media', u'http://feeds.dn.pt/DN-Media'),
             (u'Opini\xe3o', u'http://feeds.dn.pt/DN-Opiniao'),
             (u'Pessoas', u'http://feeds.dn.pt/DN-Pessoas')
             ]
recipes/economia.recipe (new file, 17 lines)
@@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1314326622(BasicNewsRecipe):
    title = u'Economia'
    __author__ = 'Manish Bhattarai'
    description = 'Economia - Intelligence & Insight for ICAEW Members'
    language = 'en_GB'
    oldest_article = 7
    max_articles_per_feed = 25
    masthead_url = 'http://economia.icaew.com/~/media/Images/Design%20Images/Economia_Red_website.ashx'
    cover_url = 'http://economia.icaew.com/~/media/Images/Design%20Images/Economia_Red_website.ashx'
    no_stylesheets = True
    remove_empty_feeds = True
    remove_tags_before = dict(id='content')
    remove_tags_after = dict(id='stars-wrapper')
    remove_tags = [dict(attrs={'class':['floatR', 'sharethis', 'rating clearfix']})]
    feeds = [(u'News', u'http://feedity.com/icaew-com/VlNTVFRa.rss'),(u'Business', u'http://feedity.com/icaew-com/VlNTVFtS.rss'),(u'People', u'http://feedity.com/icaew-com/VlNTVFtX.rss'),(u'Opinion', u'http://feedity.com/icaew-com/VlNTVFtW.rss'),(u'Finance', u'http://feedity.com/icaew-com/VlNTVFtV.rss')]
@@ -12,12 +12,6 @@ class EsensjaRSS(BasicNewsRecipe):
    language = 'pl'
    encoding = 'utf-8'
    INDEX = 'http://www.esensja.pl'
    extra_css = '''.t-title {font-size: x-large; font-weight: bold; text-align: left}
    .t-author {font-size: x-small; text-align: left}
    .t-title2 {font-size: x-small; font-style: italic; text-align: left}
    .text {font-size: small; text-align: left}
    .annot-ref {font-style: italic; text-align: left}
    '''
    cover_url = ''
    masthead_url = 'http://esensja.pl/img/wrss.gif'
    use_embedded_content = False
@@ -110,10 +110,12 @@ class FinancialTimes(BasicNewsRecipe):
        soup = self.index_to_soup(self.INDEX)
        #dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
        #self.timefmt = ' [%s]'%dates
        section_title = 'Untitled'

        for column in soup.findAll('div', attrs = {'class':'feedBoxes clearfix'}):
            for section in column. findAll('div', attrs = {'class':'feedBox'}):
                section_title=self.tag_to_string(section.find('h4'))
                sectiontitle=self.tag_to_string(section.find('h4'))
                if '...' not in sectiontitle: section_title=sectiontitle
                for article in section.ul.findAll('li'):
                    articles = []
                    title=self.tag_to_string(article.a)
recipes/forbes_pl.recipe (new file, 53 lines)
@@ -0,0 +1,53 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

from calibre.web.feeds.news import BasicNewsRecipe
import datetime
import re

class forbes_pl(BasicNewsRecipe):
    title = u'Forbes.pl'
    __author__ = 'Artur Stachecki <artur.stachecki@gmail.com>'
    language = 'pl'
    description = u'Biznes, finanse, gospodarka, strategie, wiadomości gospodarcze, analizy finasowe i strategiczne.'
    oldest_article = 1
    index = 'http://www.forbes.pl'
    cover_url = 'http://www.forbes.pl/resources/front/images/logo.png'
    max_articles_per_feed = 100
    extra_css = '.Block-Photo {float:left; max-width: 300px; margin-right: 5px;}'
    preprocess_regexps = [(re.compile(ur'<p>(<strong>)?(Czytaj|Zobacz) (też|także):.*?</p>', re.DOTALL), lambda match: ''), (re.compile(ur'<strong>Zobacz:.*?</strong>', re.DOTALL), lambda match: '')]
    remove_javascript = True
    no_stylesheets = True
    now = datetime.datetime.now()
    yesterday = now - datetime.timedelta(hours=24)
    yesterday = yesterday.strftime("%d.%m.%Y %H:%M:%S")
    pages_count = 4
    keep_only_tags = [dict(attrs={'class':['Block-Node Content-Article ', 'Block-Node Content-Article piano-closed']})]
    remove_tags = [dict(attrs={'class':['Keywords Styled', 'twitter-share-button', 'Block-List-Related Block-List']})]

    feeds = [(u'Wszystkie', 'http://www.forbes.pl/rss')]

    '''def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup


    def append_page(self, soup, appendtag):
        cleanup = False
        nexturl = appendtag.find('a', attrs={'class':'next'})
        if nexturl:
            cleanup = True
        while nexturl:
            soup2 = self.index_to_soup(self.index + nexturl['href'])
            nexturl = soup2.find('a', attrs={'class':'next'})
            pagetext = soup2.findAll(id='article-body-wrapper')
            if not pagetext:
                pagetext = soup2.findAll(attrs={'class':'Article-Entry Styled'})
            for comment in pagetext.findAll(text=lambda text:isinstance(text, Comment)):
                comment.extract()
            pos = len(appendtag.contents)
            appendtag.insert(pos, pagetext)
        if cleanup:
            for r in appendtag.findAll(attrs={'class':'paginator'}):
                r.extract()'''
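Note on the commented-out pagination helper above: if it is ever re-enabled, soup2.findAll(...) returns a result list, so calling findAll on pagetext and inserting the whole list as a single node would fail, and Comment is never imported. A minimal corrected sketch, assuming calibre's bundled BeautifulSoup 3 API (illustrative only, not part of this commit):

    from calibre.ebooks.BeautifulSoup import Comment

    def append_page(self, soup, appendtag):
        # Follow 'next' links and append each page's article body to the first page.
        nexturl = appendtag.find('a', attrs={'class':'next'})
        cleanup = nexturl is not None
        while nexturl:
            soup2 = self.index_to_soup(self.index + nexturl['href'])
            nexturl = soup2.find('a', attrs={'class':'next'})
            pagetext = soup2.find(id='article-body-wrapper') or \
                soup2.find(attrs={'class':'Article-Entry Styled'})
            if pagetext is None:
                break
            # Strip HTML comments before appending, as other calibre recipes do.
            for comment in pagetext.findAll(text=lambda text: isinstance(text, Comment)):
                comment.extract()
            appendtag.insert(len(appendtag.contents), pagetext)
        if cleanup:
            for r in appendtag.findAll(attrs={'class':'paginator'}):
                r.extract()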
108
recipes/galaxys_edge.recipe
Normal file
@ -0,0 +1,108 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class GalaxyEdge(BasicNewsRecipe):
|
||||
title = u'The Galaxy\'s Edge'
|
||||
language = 'en'
|
||||
|
||||
oldest_article = 7
|
||||
__author__ = 'Krittika Goyal'
|
||||
no_stylesheets = True
|
||||
|
||||
auto_cleanup = True
|
||||
|
||||
#keep_only_tags = [dict(id='content')]
|
||||
#remove_tags = [dict(attrs={'class':['article-links', 'breadcr']}),
|
||||
#dict(id=['email-section', 'right-column', 'printfooter', 'topover',
|
||||
#'slidebox', 'th_footer'])]
|
||||
|
||||
extra_css = '.photo-caption { font-size: smaller }'
|
||||
|
||||
def parse_index(self):
|
||||
soup = self.index_to_soup('http://www.galaxysedge.com/')
|
||||
main = soup.find('table', attrs={'width':'911'})
|
||||
toc = main.find('td', attrs={'width':'225'})
|
||||
|
||||
|
||||
|
||||
current_section = None
|
||||
current_articles = []
|
||||
feeds = []
|
||||
c = 0
|
||||
for x in toc.findAll(['p']):
|
||||
c = c+1
|
||||
if c == 5:
|
||||
if current_articles and current_section:
|
||||
feeds.append((current_section, current_articles))
|
||||
edwo = x.find('a')
|
||||
current_section = self.tag_to_string(edwo)
|
||||
current_articles = []
|
||||
self.log('\tFound section:', current_section)
|
||||
title = self.tag_to_string(edwo)
|
||||
url = edwo.get('href', True)
|
||||
url = 'http://www.galaxysedge.com/'+url
|
||||
print(title)
|
||||
print(c)
|
||||
if not url or not title:
|
||||
continue
|
||||
self.log('\t\tFound article:', title)
|
||||
self.log('\t\t\t', url)
|
||||
current_articles.append({'title': title, 'url':url,
|
||||
'description':'', 'date':''})
|
||||
elif c>5:
|
||||
current_section = self.tag_to_string(x.find('b'))
|
||||
current_articles = []
|
||||
self.log('\tFound section:', current_section)
|
||||
for y in x.findAll('a'):
|
||||
title = self.tag_to_string(y)
|
||||
url = y.get('href', True)
|
||||
url = 'http://www.galaxysedge.com/'+url
|
||||
print(title)
|
||||
if not url or not title:
|
||||
continue
|
||||
self.log('\t\tFound article:', title)
|
||||
self.log('\t\t\t', url)
|
||||
current_articles.append({'title': title, 'url':url,
|
||||
'description':'', 'date':''})
|
||||
if current_articles and current_section:
|
||||
feeds.append((current_section, current_articles))
|
||||
|
||||
return feeds
|
||||
|
||||
|
||||
|
||||
|
||||
#def preprocess_raw_html(self, raw, url):
|
||||
#return raw.replace('<body><p>', '<p>').replace('</p></body>', '</p>')
|
||||
|
||||
#def postprocess_html(self, soup, first_fetch):
|
||||
#for t in soup.findAll(['table', 'tr', 'td','center']):
|
||||
#t.name = 'div'
|
||||
#return soup
|
||||
|
||||
#def parse_index(self):
|
||||
#today = time.strftime('%Y-%m-%d')
|
||||
#soup = self.index_to_soup(
|
||||
#'http://www.thehindu.com/todays-paper/tp-index/?date=' + today)
|
||||
#div = soup.find(id='left-column')
|
||||
#feeds = []
|
||||
#current_section = None
|
||||
#current_articles = []
|
||||
#for x in div.findAll(['h3', 'div']):
|
||||
#if current_section and x.get('class', '') == 'tpaper':
|
||||
#a = x.find('a', href=True)
|
||||
#if a is not None:
|
||||
#current_articles.append({'url':a['href']+'?css=print',
|
||||
#'title':self.tag_to_string(a), 'date': '',
|
||||
#'description':''})
|
||||
#if x.name == 'h3':
|
||||
#if current_section and current_articles:
|
||||
#feeds.append((current_section, current_articles))
|
||||
#current_section = self.tag_to_string(x)
|
||||
#current_articles = []
|
||||
#return feeds
|
||||
|
||||
|
@@ -10,7 +10,7 @@ krakow.gazeta.pl
from calibre.web.feeds.news import BasicNewsRecipe

class gw_krakow(BasicNewsRecipe):
    title = u'Gazeta.pl Kraków'
    title = u'Gazeta Wyborcza Kraków'
    __author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
    language = 'pl'
    description =u'Wiadomości z Krakowa na portalu Gazeta.pl.'

@@ -5,7 +5,7 @@ import string
from calibre.web.feeds.news import BasicNewsRecipe

class GazetaPlSzczecin(BasicNewsRecipe):
    title = u'Gazeta.pl Szczecin'
    title = u'Gazeta Wyborcza Szczecin'
    description = u'Wiadomości ze Szczecina na portalu Gazeta.pl.'
    __author__ = u'Michał Szkutnik'
    __license__ = u'GPL v3'

@@ -10,7 +10,7 @@ warszawa.gazeta.pl
from calibre.web.feeds.news import BasicNewsRecipe

class gw_wawa(BasicNewsRecipe):
    title = u'Gazeta.pl Warszawa'
    title = u'Gazeta Wyborcza Warszawa'
    __author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
    language = 'pl'
    description ='Wiadomości z Warszawy na portalu Gazeta.pl.'

@@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment

class Gazeta_Wyborcza(BasicNewsRecipe):
    title = u'Gazeta.pl'
    title = u'Gazeta Wyborcza'
    __author__ = 'fenuks, Artur Stachecki'
    language = 'pl'
    description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.'

@@ -20,7 +20,7 @@ class HBR(BasicNewsRecipe):
                'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn',
                'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR',
                'mailingListTout', 'partnerCenter', 'pageFooter',
                'superNavHeadContainer', 'hbrDisqus',
                'superNavHeadContainer', 'hbrDisqus', 'article-toolbox',
                'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD']),
        dict(name='iframe')]
    extra_css = '''
Binary files:
recipes/icons/forbes_pl.png (new file, 1.2 KiB)
4 existing icons replaced (802 B before, 294 B after each)
recipes/icons/slashdot.png (new file, 250 B)
recipes/icons/sportowefakty.png (new file, 511 B)
recipes/icons/wysokie_obcasy.png (new file, 205 B)
@ -1,64 +1,44 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008-2013, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
newyorker.com
|
||||
'''
|
||||
|
||||
'''
|
||||
www.canada.com
|
||||
'''
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||
|
||||
class NewYorker(BasicNewsRecipe):
|
||||
title = 'The New Yorker'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'The best of US journalism'
|
||||
oldest_article = 15
|
||||
language = 'en'
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
publisher = 'Conde Nast Publications'
|
||||
category = 'news, politics, USA'
|
||||
encoding = 'cp1252'
|
||||
publication_type = 'magazine'
|
||||
masthead_url = 'http://www.newyorker.com/css/i/hed/logo.gif'
|
||||
extra_css = """
|
||||
body {font-family: "Times New Roman",Times,serif}
|
||||
.articleauthor{color: #9F9F9F;
|
||||
font-family: Arial, sans-serif;
|
||||
font-size: small;
|
||||
text-transform: uppercase}
|
||||
.rubric,.dd,h6#credit{color: #CD0021;
|
||||
font-family: Arial, sans-serif;
|
||||
font-size: small;
|
||||
text-transform: uppercase}
|
||||
.descender:first-letter{display: inline; font-size: xx-large; font-weight: bold}
|
||||
.dd,h6#credit{color: gray}
|
||||
.c{display: block}
|
||||
.caption,h2#articleintro{font-style: italic}
|
||||
.caption{font-size: small}
|
||||
"""
|
||||
|
||||
conversion_options = {
|
||||
'comment' : description
|
||||
, 'tags' : category
|
||||
, 'publisher' : publisher
|
||||
, 'language' : language
|
||||
}
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'pagebody'})]
|
||||
remove_tags = [
|
||||
dict(name=['meta','iframe','base','link','embed','object'])
|
||||
,dict(attrs={'class':['utils','socialUtils','articleRailLinks','icons','social-utils-top','entry-keywords','entry-categories','utilsPrintEmail'] })
|
||||
,dict(attrs={'id':['show-header','show-footer'] })
|
||||
]
|
||||
remove_tags_after = dict(attrs={'class':'entry-content'})
|
||||
remove_attributes = ['lang']
|
||||
feeds = [(u'The New Yorker', u'http://www.newyorker.com/services/mrss/feeds/everything.xml')]
|
||||
title = u'New Yorker Magazine'
|
||||
newyorker_prefix = 'http://m.newyorker.com'
|
||||
description = u'Content from the New Yorker website'
|
||||
fp_tag = 'CAN_TC'
|
||||
|
||||
def print_version(self, url):
|
||||
return url + '?printable=true¤tPage=all'
|
||||
masthead_url = 'http://www.newyorker.com/images/elements/print/newyorker_printlogo.gif'
|
||||
|
||||
def image_url_processor(self, baseurl, url):
|
||||
return url.strip()
|
||||
compress_news_images = True
|
||||
compress_news_images_auto_size = 8
|
||||
scale_news_images_to_device = False
|
||||
scale_news_images = (768, 1024)
|
||||
|
||||
url_list = []
|
||||
language = 'en'
|
||||
__author__ = 'Nick Redding'
|
||||
no_stylesheets = True
|
||||
timefmt = ' [%b %d]'
|
||||
encoding = 'utf-8'
|
||||
extra_css = '''
|
||||
.byline { font-size:xx-small; font-weight: bold;}
|
||||
h3 { margin-bottom: 6px; }
|
||||
.caption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
||||
'''
|
||||
keep_only_tags = [dict(name='div', attrs={'id':re.compile('pagebody')})]
|
||||
|
||||
remove_tags = [{'class':'socialUtils'},{'class':'entry-keywords'}]
|
||||
|
||||
def get_cover_url(self):
|
||||
cover_url = "http://www.newyorker.com/images/covers/1925/1925_02_21_p233.jpg"
|
||||
@ -68,13 +48,233 @@ class NewYorker(BasicNewsRecipe):
|
||||
cover_url = 'http://www.newyorker.com' + cover_item.div.img['src'].strip()
|
||||
return cover_url
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
auth = soup.find(attrs={'id':'articleauthor'})
|
||||
if auth:
|
||||
alink = auth.find('a')
|
||||
if alink and alink.string is not None:
|
||||
txt = alink.string
|
||||
alink.replaceWith(txt)
|
||||
def fixChars(self,string):
|
||||
# Replace lsquo (\x91)
|
||||
fixed = re.sub("\x91","‘",string)
|
||||
# Replace rsquo (\x92)
|
||||
fixed = re.sub("\x92","’",fixed)
|
||||
# Replace ldquo (\x93)
|
||||
fixed = re.sub("\x93","“",fixed)
|
||||
# Replace rdquo (\x94)
|
||||
fixed = re.sub("\x94","”",fixed)
|
||||
# Replace ndash (\x96)
|
||||
fixed = re.sub("\x96","–",fixed)
|
||||
# Replace mdash (\x97)
|
||||
fixed = re.sub("\x97","—",fixed)
|
||||
fixed = re.sub("’","’",fixed)
|
||||
return fixed
|
||||
|
||||
def massageNCXText(self, description):
|
||||
# Kindle TOC descriptions won't render certain characters
|
||||
if description:
|
||||
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
||||
# Replace '&' with '&'
|
||||
massaged = re.sub("&","&", massaged)
|
||||
return self.fixChars(massaged)
|
||||
else:
|
||||
return description
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
if first:
|
||||
picdiv = soup.find('body').find('img')
|
||||
if picdiv is not None:
|
||||
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
|
||||
xtitle = article.text_summary.strip()
|
||||
if len(xtitle) == 0:
|
||||
desc = soup.find('meta',attrs={'property':'og:description'})
|
||||
if desc is not None:
|
||||
article.summary = article.text_summary = desc['content']
|
||||
shortparagraph = ""
|
||||
## try:
|
||||
if len(article.text_summary.strip()) == 0:
|
||||
articlebodies = soup.findAll('div',attrs={'class':'entry-content'})
|
||||
if articlebodies:
|
||||
for articlebody in articlebodies:
|
||||
if articlebody:
|
||||
paras = articlebody.findAll('p')
|
||||
for p in paras:
|
||||
refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
|
||||
#account for blank paragraphs and short paragraphs by appending them to longer ones
|
||||
if len(refparagraph) > 0:
|
||||
if len(refparagraph) > 70: #approximately one line of text
|
||||
newpara = shortparagraph + refparagraph
|
||||
article.summary = article.text_summary = newpara.strip()
|
||||
return
|
||||
else:
|
||||
shortparagraph = refparagraph + " "
|
||||
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
|
||||
shortparagraph = shortparagraph + "- "
|
||||
else:
|
||||
article.summary = article.text_summary = self.massageNCXText(article.text_summary)
|
||||
## except:
|
||||
## self.log("Error creating article descriptions")
|
||||
## return
|
||||
|
||||
|
||||
def strip_anchors(self,soup):
|
||||
paras = soup.findAll(True)
|
||||
for para in paras:
|
||||
aTags = para.findAll('a')
|
||||
for a in aTags:
|
||||
if a.img is None:
|
||||
a.replaceWith(a.renderContents().decode('cp1252','replace'))
|
||||
return soup
|
||||
|
||||
def preprocess_html(self,soup):
|
||||
dateline = soup.find('div','published')
|
||||
byline = soup.find('div','byline')
|
||||
title = soup.find('h1','entry-title')
|
||||
if title is None:
|
||||
return self.strip_anchors(soup)
|
||||
if byline is None:
|
||||
title.append(dateline)
|
||||
return self.strip_anchors(soup)
|
||||
byline.append(dateline)
|
||||
return self.strip_anchors(soup)
|
||||
|
||||
def load_global_nav(self,soup):
|
||||
seclist = []
|
||||
ul = soup.find('ul',attrs={'id':re.compile('global-nav-menu')})
|
||||
if ul is not None:
|
||||
for li in ul.findAll('li'):
|
||||
if li.a is not None:
|
||||
securl = li.a['href']
|
||||
if securl != '/' and securl != '/magazine' and securl.startswith('/'):
|
||||
seclist.append((self.tag_to_string(li.a),self.newyorker_prefix+securl))
|
||||
return seclist
|
||||
|
||||
def exclude_url(self,url):
|
||||
if url in self.url_list:
|
||||
return True
|
||||
if not url.endswith('html'):
|
||||
return True
|
||||
if 'goings-on-about-town-app' in url:
|
||||
return True
|
||||
if 'something-to-be-thankful-for' in url:
|
||||
return True
|
||||
if '/shouts/' in url:
|
||||
return True
|
||||
if 'out-loud' in url:
|
||||
return True
|
||||
if '/rss/' in url:
|
||||
return True
|
||||
if '/video-' in url:
|
||||
return True
|
||||
self.url_list.append(url)
|
||||
return False
|
||||
|
||||
|
||||
def load_index_page(self,soup):
|
||||
article_list = []
|
||||
for div in soup.findAll('div',attrs={'class':re.compile('^rotator')}):
|
||||
h2 = div.h2
|
||||
if h2 is not None:
|
||||
a = h2.a
|
||||
if a is not None:
|
||||
url = a['href']
|
||||
if not self.exclude_url(url):
|
||||
if url.startswith('/'):
|
||||
url = self.newyorker_prefix+url
|
||||
byline = h2.span
|
||||
if byline is not None:
|
||||
author = self.tag_to_string(byline)
|
||||
if author.startswith('by '):
|
||||
author.replace('by ','')
|
||||
byline.extract()
|
||||
else:
|
||||
author = ''
|
||||
if h2.br is not None:
|
||||
h2.br.replaceWith(' ')
|
||||
title = self.tag_to_string(h2)
|
||||
desc = div.find(attrs={'class':['rotator-ad-body','feature-blurb-text']})
|
||||
if desc is not None:
|
||||
description = self.tag_to_string(desc)
|
||||
else:
|
||||
description = ''
|
||||
article_list.append(dict(title=title,url=url,date='',description=description,author=author,content=''))
|
||||
ul = div.find('ul','feature-blurb-links')
|
||||
if ul is not None:
|
||||
for li in ul.findAll('li'):
|
||||
a = li.a
|
||||
if a is not None:
|
||||
url = a['href']
|
||||
if not self.exclude_url(url):
|
||||
if url.startswith('/'):
|
||||
url = self.newyorker_prefix+url
|
||||
if a.br is not None:
|
||||
a.br.replaceWith(' ')
|
||||
title = '>>'+self.tag_to_string(a)
|
||||
article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
|
||||
for h3 in soup.findAll('h3','header'):
|
||||
a = h3.a
|
||||
if a is not None:
|
||||
url = a['href']
|
||||
if not self.exclude_url(url):
|
||||
if url.startswith('/'):
|
||||
url = self.newyorker_prefix+url
|
||||
byline = h3.span
|
||||
if byline is not None:
|
||||
author = self.tag_to_string(byline)
|
||||
if author.startswith('by '):
|
||||
author = author.replace('by ','')
|
||||
byline.extract()
|
||||
else:
|
||||
author = ''
|
||||
if h3.br is not None:
|
||||
h3.br.replaceWith(' ')
|
||||
title = self.tag_to_string(h3).strip()
|
||||
article_list.append(dict(title=title,url=url,date='',description='',author=author,content=''))
|
||||
return article_list
|
||||
|
||||
def load_global_section(self,securl):
|
||||
article_list = []
|
||||
try:
|
||||
soup = self.index_to_soup(securl)
|
||||
except:
|
||||
return article_list
|
||||
if '/blogs/' not in securl:
|
||||
return self.load_index_page(soup)
|
||||
for div in soup.findAll('div',attrs={'id':re.compile('^entry')}):
|
||||
h3 = div.h3
|
||||
if h3 is not None:
|
||||
a = h3.a
|
||||
if a is not None:
|
||||
url = a['href']
|
||||
if not self.exclude_url(url):
|
||||
if url.startswith('/'):
|
||||
url = self.newyorker_prefix+url
|
||||
if h3.br is not None:
|
||||
h3.br.replaceWith(' ')
|
||||
title = self.tag_to_string(h3)
|
||||
article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
|
||||
return article_list
|
||||
|
||||
def filter_ans(self, ans) :
|
||||
total_article_count = 0
|
||||
idx = 0
|
||||
idx_max = len(ans)-1
|
||||
while idx <= idx_max:
|
||||
if True: #self.verbose
|
||||
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
|
||||
for article in ans[idx][1]:
|
||||
total_article_count += 1
|
||||
if True: #self.verbose
|
||||
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
|
||||
article['url'].replace('http://m.newyorker.com','').encode('cp1252','replace')))
|
||||
idx = idx+1
|
||||
self.log( "Queued %d articles" % total_article_count )
|
||||
return ans
|
||||
|
||||
|
||||
def parse_index(self):
|
||||
ans = []
|
||||
try:
|
||||
soup = self.index_to_soup(self.newyorker_prefix)
|
||||
except:
|
||||
return ans
|
||||
seclist = self.load_global_nav(soup)
|
||||
ans.append(('Front Page',self.load_index_page(soup)))
|
||||
for (sectitle,securl) in seclist:
|
||||
ans.append((sectitle,self.load_global_section(securl)))
|
||||
return self.filter_ans(ans)
|
||||
|
||||
|
recipes/sportowefakty.recipe (new file, 70 lines)
@@ -0,0 +1,70 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.magick import Image

class sportowefakty(BasicNewsRecipe):
    title = u'SportoweFakty'
    __author__ = 'Artur Stachecki <artur.stachecki@gmail.com>, Tomasz Długosz <tomek3d@gmail.com>'
    language = 'pl'
    description = u'Najważniejsze informacje sportowe z kraju i ze świata, relacje, komentarze, wywiady, zdjęcia!'
    oldest_article = 1
    masthead_url='http://www.sportowefakty.pl/images/logo.png'
    max_articles_per_feed = 100
    simultaneous_downloads = 5
    use_embedded_content=False
    remove_javascript=True
    no_stylesheets=True
    ignore_duplicate_articles = {'title', 'url'}

    keep_only_tags = [dict(attrs = {'class' : 'box-article'})]
    remove_tags =[]
    remove_tags.append(dict(attrs = {'class' : re.compile(r'^newsStream')}))
    remove_tags.append(dict(attrs = {'target' : '_blank'}))

    feeds = [
        (u'Piłka Nożna', u'http://www.sportowefakty.pl/pilka-nozna/index.rss'),
        (u'Koszykówka', u'http://www.sportowefakty.pl/koszykowka/index.rss'),
        (u'Żużel', u'http://www.sportowefakty.pl/zuzel/index.rss'),
        (u'Siatkówka', u'http://www.sportowefakty.pl/siatkowka/index.rss'),
        (u'Zimowe', u'http://www.sportowefakty.pl/zimowe/index.rss'),
        (u'Hokej', u'http://www.sportowefakty.pl/hokej/index.rss'),
        (u'Moto', u'http://www.sportowefakty.pl/moto/index.rss'),
        (u'Tenis', u'http://www.sportowefakty.pl/tenis/index.rss')
        ]

    def get_article_url(self, article):
        link = article.get('link', None)
        if 'utm_source' in link:
            return link.split('?utm')[0]
        else:
            return link

    def print_version(self, url):
        print_url = url + '/drukuj'
        return print_url

    def preprocess_html(self, soup):
        head = soup.find('h1')
        if 'Fotorelacja' in self.tag_to_string(head):
            return None
        else:
            for alink in soup.findAll('a'):
                if alink.string is not None:
                    tstr = alink.string
                    alink.replaceWith(tstr)
            return soup

    def postprocess_html(self, soup, first):
        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            if img < 0:
                raise RuntimeError('Out of memory')
            img.type = "GrayscaleType"
            img.save(iurl)
        return soup
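Note: a quick illustration of the URL handling in the recipe above. get_article_url drops the feed-tracking query string and print_version appends the print suffix; the article URL below is invented purely for illustration:

    link = 'http://www.sportowefakty.pl/pilka-nozna/artykul/12345?utm_source=rss&utm_medium=feed'  # hypothetical
    clean = link.split('?utm')[0]   # 'http://www.sportowefakty.pl/pilka-nozna/artykul/12345'
    print_url = clean + '/drukuj'   # 'http://www.sportowefakty.pl/pilka-nozna/artykul/12345/drukuj'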
@ -36,47 +36,21 @@ class TheOnion(BasicNewsRecipe):
|
||||
, 'publisher': publisher
|
||||
, 'language' : language
|
||||
}
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='h2', attrs={'class':['section_title','title']})
|
||||
,dict(attrs={'class':['main_image','meta','article_photo_lead','article_body']})
|
||||
,dict(attrs={'id':['entries']})
|
||||
]
|
||||
remove_attributes=['lang','rel']
|
||||
remove_tags_after = dict(attrs={'class':['article_body','feature_content']})
|
||||
keep_only_tags = [dict(name='article', attrs={'class':'full-article'})]
|
||||
remove_tags = [
|
||||
dict(name=['object','link','iframe','base','meta'])
|
||||
,dict(name='div', attrs={'class':['toolbar_side','graphical_feature','toolbar_bottom']})
|
||||
,dict(name='div', attrs={'id':['recent_slider','sidebar','pagination','related_media']})
|
||||
]
|
||||
|
||||
dict(name=['nav', 'aside', 'section', 'meta']),
|
||||
{'attrs':{'class':lambda x: x and ('share-tools' in x or 'ad-zone' in x)}},
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'Daily' , u'http://feeds.theonion.com/theonion/daily' )
|
||||
,(u'Sports' , u'http://feeds.theonion.com/theonion/sports' )
|
||||
]
|
||||
|
||||
def get_article_url(self, article):
|
||||
artl = BasicNewsRecipe.get_article_url(self, article)
|
||||
if artl.startswith('http://www.theonion.com/audio/'):
|
||||
artl = None
|
||||
return artl
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
for item in soup.findAll('a'):
|
||||
limg = item.find('img')
|
||||
if item.string is not None:
|
||||
str = item.string
|
||||
item.replaceWith(str)
|
||||
else:
|
||||
if limg:
|
||||
item.name = 'div'
|
||||
item.attrs = []
|
||||
if not limg.has_key('alt'):
|
||||
limg['alt'] = 'image'
|
||||
else:
|
||||
str = self.tag_to_string(item)
|
||||
item.replaceWith(str)
|
||||
def preprocess_html(self, soup, *args):
|
||||
for img in soup.findAll('img', attrs={'data-src':True}):
|
||||
if img['data-src']:
|
||||
img['src'] = img['data-src']
|
||||
return soup
|
||||
|
||||
|
||||
|
recipes/universe_today.recipe (new file, 17 lines)
@@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe

class UniverseToday(BasicNewsRecipe):
    title = u'Universe Today'
    language = 'en'
    description = u'Space and astronomy news.'
    __author__ = 'seird'
    publisher = u'universetoday.com'
    category = 'science, astronomy, news, rss'
    oldest_article = 7
    max_articles_per_feed = 40
    auto_cleanup = True
    no_stylesheets = True
    use_embedded_content = False
    remove_empty_feeds = True

    feeds = [(u'Universe Today', u'http://feeds.feedburner.com/universetoday/pYdq')]
@ -6,17 +6,62 @@ __license__ = 'GPL v3'
|
||||
www.canada.com
|
||||
'''
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup
|
||||
|
||||
|
||||
class TimesColonist(BasicNewsRecipe):
|
||||
|
||||
# Customization -- remove sections you don't want.
|
||||
# If your e-reader is an e-ink Kindle and your output profile is
|
||||
# set properly this recipe will not include images because the
|
||||
# resulting file is too large. If you have one of these and want
|
||||
# images you can set kindle_omit_images = False
|
||||
# and remove sections (typically the e-ink Kindles will
|
||||
# work with about a dozen of these, but your mileage may vary).
|
||||
|
||||
kindle_omit_images = True
|
||||
|
||||
section_list = [
|
||||
('','Web Front Page'),
|
||||
('news/','News Headlines'),
|
||||
('news/b-c/','BC News'),
|
||||
('news/national/','National News'),
|
||||
('news/world/','World News'),
|
||||
('opinion/','Opinion'),
|
||||
('opinion/letters/','Letters'),
|
||||
('business/','Business'),
|
||||
('business/money/','Money'),
|
||||
('business/technology/','Technology'),
|
||||
('business/working/','Working'),
|
||||
('sports/','Sports'),
|
||||
('sports/hockey/','Hockey'),
|
||||
('sports/football/','Football'),
|
||||
('sports/basketball/','Basketball'),
|
||||
('sports/golf/','Golf'),
|
||||
('entertainment/','entertainment'),
|
||||
('entertainment/go/','Go!'),
|
||||
('entertainment/music/','Music'),
|
||||
('entertainment/books/','Books'),
|
||||
('entertainment/Movies/','Movies'),
|
||||
('entertainment/television/','Television'),
|
||||
('life/','Life'),
|
||||
('life/health/','Health'),
|
||||
('life/travel/','Travel'),
|
||||
('life/driving/','Driving'),
|
||||
('life/homes/','Homes'),
|
||||
('life/food-drink/','Food & Drink')
|
||||
]
|
||||
|
||||
title = u'Victoria Times Colonist'
|
||||
url_prefix = 'http://www.timescolonist.com'
|
||||
description = u'News from Victoria, BC'
|
||||
fp_tag = 'CAN_TC'
|
||||
|
||||
masthead_url = 'http://www.timescolonist.com/gmg/img/global/logoTimesColonist.png'
|
||||
|
||||
|
||||
url_list = []
|
||||
language = 'en_CA'
|
||||
__author__ = 'Nick Redding'
|
||||
@ -29,15 +74,21 @@ class TimesColonist(BasicNewsRecipe):
|
||||
.caption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
||||
'''
|
||||
keep_only_tags = [dict(name='div', attrs={'class':re.compile('main.content')})]
|
||||
remove_tags = [{'class':'comments'},
|
||||
{'id':'photocredit'},
|
||||
dict(name='div', attrs={'class':re.compile('top.controls')}),
|
||||
dict(name='div', attrs={'class':re.compile('social')}),
|
||||
dict(name='div', attrs={'class':re.compile('tools')}),
|
||||
dict(name='div', attrs={'class':re.compile('bottom.tools')}),
|
||||
dict(name='div', attrs={'class':re.compile('window')}),
|
||||
dict(name='div', attrs={'class':re.compile('related.news.element')})]
|
||||
|
||||
def __init__(self, options, log, progress_reporter):
|
||||
self.remove_tags = [{'class':'comments'},
|
||||
{'id':'photocredit'},
|
||||
dict(name='div', attrs={'class':re.compile('top.controls')}),
|
||||
dict(name='div', attrs={'class':re.compile('^comments')}),
|
||||
dict(name='div', attrs={'class':re.compile('social')}),
|
||||
dict(name='div', attrs={'class':re.compile('tools')}),
|
||||
dict(name='div', attrs={'class':re.compile('bottom.tools')}),
|
||||
dict(name='div', attrs={'class':re.compile('window')}),
|
||||
dict(name='div', attrs={'class':re.compile('related.news.element')})]
|
||||
print("PROFILE NAME = "+options.output_profile.short_name)
|
||||
if self.kindle_omit_images and options.output_profile.short_name in ['kindle', 'kindle_dx', 'kindle_pw']:
|
||||
self.remove_tags.append(dict(name='div', attrs={'class':re.compile('image-container')}))
|
||||
BasicNewsRecipe.__init__(self, options, log, progress_reporter)
|
||||
|
||||
def get_cover_url(self):
|
||||
from datetime import timedelta, date
|
||||
@ -122,7 +173,6 @@ class TimesColonist(BasicNewsRecipe):
|
||||
def preprocess_html(self,soup):
|
||||
byline = soup.find('p',attrs={'class':re.compile('ancillary')})
|
||||
if byline is not None:
|
||||
byline.find('a')
|
||||
authstr = self.tag_to_string(byline,False)
|
||||
authstr = re.sub('/ *Times Colonist','/',authstr, flags=re.IGNORECASE)
|
||||
authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE)
|
||||
@ -149,9 +199,10 @@ class TimesColonist(BasicNewsRecipe):
|
||||
atag = htag.a
|
||||
if atag is not None:
|
||||
url = atag['href']
|
||||
#print("Checking "+url)
|
||||
if atag['href'].startswith('/'):
|
||||
url = self.url_prefix+atag['href']
|
||||
url = url.strip()
|
||||
# print("Checking >>"+url+'<<\n\r')
|
||||
if url.startswith('/'):
|
||||
url = self.url_prefix+url
|
||||
if url in self.url_list:
|
||||
return
|
||||
self.url_list.append(url)
|
||||
@ -171,10 +222,10 @@ class TimesColonist(BasicNewsRecipe):
|
||||
if dtag is not None:
|
||||
description = self.tag_to_string(dtag,False)
|
||||
article_list.append(dict(title=title,url=url,date='',description=description,author='',content=''))
|
||||
#print(sectitle+title+": description = "+description+" URL="+url)
|
||||
print(sectitle+title+": description = "+description+" URL="+url+'\n\r')
|
||||
|
||||
def add_section_index(self,ans,securl,sectitle):
|
||||
print("Add section url="+self.url_prefix+'/'+securl)
|
||||
print("Add section url="+self.url_prefix+'/'+securl+'\n\r')
|
||||
try:
|
||||
soup = self.index_to_soup(self.url_prefix+'/'+securl)
|
||||
except:
|
||||
@ -193,33 +244,7 @@ class TimesColonist(BasicNewsRecipe):
|
||||
|
||||
def parse_index(self):
|
||||
ans = []
|
||||
ans = self.add_section_index(ans,'','Web Front Page')
|
||||
ans = self.add_section_index(ans,'news/','News Headlines')
|
||||
ans = self.add_section_index(ans,'news/b-c/','BC News')
|
||||
ans = self.add_section_index(ans,'news/national/','Natioanl News')
|
||||
ans = self.add_section_index(ans,'news/world/','World News')
|
||||
ans = self.add_section_index(ans,'opinion/','Opinion')
|
||||
ans = self.add_section_index(ans,'opinion/letters/','Letters')
|
||||
ans = self.add_section_index(ans,'business/','Business')
|
||||
ans = self.add_section_index(ans,'business/money/','Money')
|
||||
ans = self.add_section_index(ans,'business/technology/','Technology')
|
||||
ans = self.add_section_index(ans,'business/working/','Working')
|
||||
ans = self.add_section_index(ans,'sports/','Sports')
|
||||
ans = self.add_section_index(ans,'sports/hockey/','Hockey')
|
||||
ans = self.add_section_index(ans,'sports/football/','Football')
|
||||
ans = self.add_section_index(ans,'sports/basketball/','Basketball')
|
||||
ans = self.add_section_index(ans,'sports/golf/','Golf')
|
||||
ans = self.add_section_index(ans,'entertainment/','entertainment')
|
||||
ans = self.add_section_index(ans,'entertainment/go/','Go!')
|
||||
ans = self.add_section_index(ans,'entertainment/music/','Music')
|
||||
ans = self.add_section_index(ans,'entertainment/books/','Books')
|
||||
ans = self.add_section_index(ans,'entertainment/Movies/','movies')
|
||||
ans = self.add_section_index(ans,'entertainment/television/','Television')
|
||||
ans = self.add_section_index(ans,'life/','Life')
|
||||
ans = self.add_section_index(ans,'life/health/','Health')
|
||||
ans = self.add_section_index(ans,'life/travel/','Travel')
|
||||
ans = self.add_section_index(ans,'life/driving/','Driving')
|
||||
ans = self.add_section_index(ans,'life/homes/','Homes')
|
||||
ans = self.add_section_index(ans,'life/food-drink/','Food & Drink')
|
||||
for (url,title) in self.section_list:
|
||||
ans = self.add_section_index(ans,url,title)
|
||||
return ans
|
||||
|
||||
|
@ -1,144 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class GazetaWyborczaDuzyForma(BasicNewsRecipe):
|
||||
cover_url = 'http://bi.gazeta.pl/im/8/5415/m5415058.gif'
|
||||
title = u"Gazeta Wyborcza Duzy Format"
|
||||
__author__ = 'ravcio - rlelusz[at]gmail.com'
|
||||
description = u"Articles from Gazeta's website"
|
||||
language = 'pl'
|
||||
max_articles_per_feed = 50 #you can increade it event up to maybe 600, should still work
|
||||
recursions = 0
|
||||
encoding = 'iso-8859-2'
|
||||
no_stylesheets = True
|
||||
remove_javascript = True
|
||||
use_embedded_content = False
|
||||
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'id':['k1']})
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class':['zdjM', 'rel_video', 'zdjP', 'rel_box', 'index mod_zi_dolStrony']})
|
||||
,dict(name='div', attrs={'id':['source', 'banP4', 'article_toolbar', 'rel', 'inContext_disabled']})
|
||||
,dict(name='ul', attrs={'id':['articleToolbar']})
|
||||
,dict(name='img', attrs={'class':['brand']})
|
||||
,dict(name='h5', attrs={'class':['author']})
|
||||
,dict(name='h6', attrs={'class':['date']})
|
||||
,dict(name='p', attrs={'class':['txt_upl']})
|
||||
]
|
||||
|
||||
remove_tags_after = [
|
||||
dict(name='div', attrs={'id':['Str']}) #nawigator numerow linii
|
||||
]
|
||||
|
||||
def load_article_links(self, url, count):
|
||||
print '--- load_article_links', url, count
|
||||
|
||||
#page with link to articles
|
||||
soup = self.index_to_soup(url)
|
||||
|
||||
#table with articles
|
||||
list = soup.find('div', attrs={'class':'GWdalt'})
|
||||
|
||||
#single articles (link, title, ...)
|
||||
links = list.findAll('div', attrs={'class':['GWdaltE']})
|
||||
|
||||
if len(links) < count:
|
||||
#load links to more articles...
|
||||
|
||||
#remove new link
|
||||
pages_nav = list.find('div', attrs={'class':'pages'})
|
||||
next = pages_nav.find('a', attrs={'class':'next'})
|
||||
if next:
|
||||
print 'next=', next['href']
|
||||
url = 'http://wyborcza.pl' + next['href']
|
||||
#e.g. url = 'http://wyborcza.pl/0,75480.html?str=2'
|
||||
|
||||
older_links = self.load_article_links(url, count - len(links))
|
||||
links.extend(older_links)
|
||||
|
||||
return links
|
||||
|
||||
|
||||
#produce list of articles to download
|
||||
def parse_index(self):
|
||||
print '--- parse_index'
|
||||
|
||||
max_articles = 8000
|
||||
links = self.load_article_links('http://wyborcza.pl/0,75480.html', max_articles)
|
||||
|
||||
ans = []
|
||||
key = None
|
||||
articles = {}
|
||||
|
||||
key = 'Uncategorized'
|
||||
articles[key] = []
|
||||
|
||||
for div_art in links:
|
||||
div_date = div_art.find('div', attrs={'class':'kL'})
|
||||
div = div_art.find('div', attrs={'class':'kR'})
|
||||
|
||||
a = div.find('a', href=True)
|
||||
|
||||
url = a['href']
|
||||
title = a.string
|
||||
description = ''
|
||||
pubdate = div_date.string.rstrip().lstrip()
|
||||
summary = div.find('span', attrs={'class':'lead'})
|
||||
|
||||
desc = summary.find('a', href=True)
|
||||
if desc:
|
||||
desc.extract()
|
||||
|
||||
description = self.tag_to_string(summary, use_alt=False)
|
||||
description = description.rstrip().lstrip()
|
||||
|
||||
feed = key if key is not None else 'Duzy Format'
|
||||
|
||||
if not articles.has_key(feed):
|
||||
articles[feed] = []
|
||||
|
||||
if description != '': # skip just pictures atricle
|
||||
articles[feed].append(
|
||||
dict(title=title, url=url, date=pubdate,
|
||||
description=description,
|
||||
content=''))
|
||||
|
||||
ans = [(key, articles[key])]
|
||||
return ans
|
||||
|
||||
def append_page(self, soup, appendtag, position):
|
||||
pager = soup.find('div',attrs={'id':'Str'})
|
||||
if pager:
|
||||
#seek for 'a' element with nast value (if not found exit)
|
||||
list = pager.findAll('a')
|
||||
|
||||
for elem in list:
|
||||
if 'nast' in elem.string:
|
||||
nexturl = elem['href']
|
||||
|
||||
soup2 = self.index_to_soup('http://warszawa.gazeta.pl' + nexturl)
|
||||
|
||||
texttag = soup2.find('div', attrs={'id':'artykul'})
|
||||
|
||||
newpos = len(texttag.contents)
|
||||
self.append_page(soup2,texttag,newpos)
|
||||
texttag.extract()
|
||||
appendtag.insert(position,texttag)
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
self.append_page(soup, soup.body, 3)
|
||||
|
||||
# finally remove some tags
|
||||
pager = soup.find('div',attrs={'id':'Str'})
|
||||
if pager:
|
||||
pager.extract()
|
||||
|
||||
pager = soup.find('div',attrs={'class':'tylko_int'})
|
||||
if pager:
|
||||
pager.extract()
|
||||
|
||||
return soup
|
recipes/wysokie_obcasy.recipe (new file, 57 lines)
@@ -0,0 +1,57 @@
#!/usr/bin/env python
__license__ = 'GPL v3'

from calibre.web.feeds.news import BasicNewsRecipe

class WysokieObcasyRecipe(BasicNewsRecipe):
    __author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
    language = 'pl'
    version = 1

    title = u'Wysokie Obcasy'
    publisher = 'Agora SA'
    description = u'Serwis sobotniego dodatku do Gazety Wyborczej'
    category='magazine'
    language = 'pl'
    publication_type = 'magazine'
    cover_url=''
    remove_empty_feeds= True
    no_stylesheets=True
    oldest_article = 7
    max_articles_per_feed = 100000
    recursions = 0

    no_stylesheets = True
    remove_javascript = True
    simultaneous_downloads = 5

    keep_only_tags =[]
    keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'article'}))

    remove_tags =[]
    remove_tags.append(dict(name = 'img'))
    remove_tags.append(dict(name = 'p', attrs = {'class' : 'info'}))

    extra_css = '''
        body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
        h1{text-align: left;}
        '''

    feeds = [
        ('Wszystkie Artykuly', 'feed://www.wysokieobcasy.pl/pub/rss/wysokieobcasy.xml'),
        ]

    def print_version(self,url):
        baseURL='http://www.wysokieobcasy.pl/wysokie-obcasy'
        segments = url.split(',')
        subPath= '/2029020,'
        articleURL1 = segments[1]
        articleURL2 = segments[2]
        printVerString=articleURL1 + ',' + articleURL2
        s= baseURL + subPath + printVerString + '.html'
        return s

    def get_cover_url(self):
        soup = self.index_to_soup('http://www.wysokieobcasy.pl/wysokie-obcasy/0,0.html')
        self.cover_url = soup.find(attrs={'class':'holder_cr'}).find('img')['src']
        return getattr(self, 'cover_url', self.cover_url)
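Note: a worked example of the print_version rewrite above. Wyborcza-style article URLs have the form prefix,sectionid,articleid,slug.html; the method splices the second and third comma-separated segments into a fixed print path. The article URL below is invented purely for illustration:

    url = 'http://www.wysokieobcasy.pl/wysokie-obcasy/1,96856,13685360,Some_Article.html'  # hypothetical
    segments = url.split(',')
    print_url = ('http://www.wysokieobcasy.pl/wysokie-obcasy/2029020,'
                 + segments[1] + ',' + segments[2] + '.html')
    # -> 'http://www.wysokieobcasy.pl/wysokie-obcasy/2029020,96856,13685360.html'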
@ -357,7 +357,7 @@
|
||||
<xsl:apply-templates/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="rtf:table">
|
||||
<xsl:template match="rtf:table">
|
||||
<xsl:element name="table">
|
||||
<xsl:attribute name="id">
|
||||
<xsl:value-of select="generate-id(.)"/>
|
||||
@ -390,7 +390,6 @@
|
||||
|
||||
|
||||
<xsl:output method = "xml"/>
|
||||
|
||||
<xsl:key name="style-types" match="rtf:paragraph-definition" use="@style-number"/>
|
||||
|
||||
|
||||
@ -415,13 +414,11 @@
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="rtf:page-break">
|
||||
<xsl:element name="br">
|
||||
<xsl:attribute name="style">page-break-after:always</xsl:attribute>
|
||||
</xsl:element>
|
||||
<br style = "page-break-after:always"/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="rtf:hardline-break">
|
||||
<xsl:element name="br"/>
|
||||
<br/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="rtf:rtf-definition|rtf:font-table|rtf:color-table|rtf:style-table|rtf:page-definition|rtf:list-table|rtf:override-table|rtf:override-list|rtf:list-text"/>
|
||||
@ -445,7 +442,7 @@
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match = "rtf:field-block">
|
||||
<xsl:apply-templates/>
|
||||
<xsl:apply-templates/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match = "rtf:field[@type='hyperlink']">
|
||||
@ -472,9 +469,7 @@
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="rtf:pict">
|
||||
<xsl:element name="img">
|
||||
<xsl:attribute name="src"><xsl:value-of select="@num" /></xsl:attribute>
|
||||
</xsl:element>
|
||||
<img src = "{@num}"/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="*">
|
||||
|
@@ -4,7 +4,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
__appname__ = u'calibre'
numeric_version = (0, 9, 25)
numeric_version = (0, 9, 26)
__version__ = u'.'.join(map(unicode, numeric_version))
__author__ = u"Kovid Goyal <kovid@kovidgoyal.net>"
@@ -757,9 +757,10 @@ from calibre.ebooks.metadata.sources.isbndb import ISBNDB
from calibre.ebooks.metadata.sources.overdrive import OverDrive
from calibre.ebooks.metadata.sources.douban import Douban
from calibre.ebooks.metadata.sources.ozon import Ozon
# from calibre.ebooks.metadata.sources.google_images import GoogleImages
from calibre.ebooks.metadata.sources.google_images import GoogleImages
from calibre.ebooks.metadata.sources.big_book_search import BigBookSearch

plugins += [GoogleBooks, Amazon, Edelweiss, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon]
plugins += [GoogleBooks, GoogleImages, Amazon, Edelweiss, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon, BigBookSearch]

# }}}
@@ -91,7 +91,7 @@ def restore_plugin_state_to_default(plugin_or_name):
    config['enabled_plugins'] = ep

default_disabled_plugins = set([
    'Overdrive', 'Douban Books', 'OZON.ru', 'Edelweiss', 'Google Images',
    'Overdrive', 'Douban Books', 'OZON.ru', 'Edelweiss', 'Google Images', 'Big Book Search',
])

def is_disabled(plugin):
@ -132,7 +132,7 @@ class Worker(Thread): # Get details {{{
|
||||
text()="Détails sur le produit" or \
|
||||
text()="Detalles del producto" or \
|
||||
text()="Detalhes do produto" or \
|
||||
text()="登録情報"]/../div[@class="content"]
|
||||
starts-with(text(), "登録情報")]/../div[@class="content"]
|
||||
'''
|
||||
# Editor: is for Spanish
|
||||
self.publisher_xpath = '''
|
||||
@ -235,6 +235,12 @@ class Worker(Thread): # Get details {{{
|
||||
msg = 'Failed to parse amazon details page: %r'%self.url
|
||||
self.log.exception(msg)
|
||||
return
|
||||
if self.domain == 'jp':
|
||||
for a in root.xpath('//a[@href]'):
|
||||
if 'black-curtain-redirect.html' in a.get('href'):
|
||||
self.url = 'http://amazon.co.jp'+a.get('href')
|
||||
self.log('Black curtain redirect found, following')
|
||||
return self.get_details()
|
||||
|
||||
errmsg = root.xpath('//*[@id="errorMessage"]')
|
||||
if errmsg:
|
||||
@ -252,8 +258,8 @@ class Worker(Thread): # Get details {{{
|
||||
self.log.exception('Error parsing asin for url: %r'%self.url)
|
||||
asin = None
|
||||
if self.testing:
|
||||
import tempfile
|
||||
with tempfile.NamedTemporaryFile(prefix=asin + '_',
|
||||
import tempfile, uuid
|
||||
with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
|
||||
suffix='.html', delete=False) as f:
|
||||
f.write(raw)
|
||||
print ('Downloaded html for', asin, 'saved in', f.name)
|
||||
@ -499,7 +505,7 @@ class Worker(Thread): # Get details {{{
|
||||
def parse_language(self, pd):
|
||||
for x in reversed(pd.xpath(self.language_xpath)):
|
||||
if x.tail:
|
||||
raw = x.tail.strip()
|
||||
raw = x.tail.strip().partition(',')[0].strip()
|
||||
ans = self.lang_map.get(raw, None)
|
||||
if ans:
|
||||
return ans
|
||||
@ -1004,6 +1010,11 @@ if __name__ == '__main__': # tests {{{
|
||||
] # }}}
|
||||
|
||||
jp_tests = [ # {{{
|
||||
( # Adult filtering test
|
||||
{'identifiers':{'isbn':'4799500066'}},
|
||||
[title_test(u'Bitch Trap'),]
|
||||
),
|
||||
|
||||
( # isbn -> title, authors
|
||||
{'identifiers':{'isbn': '9784101302720' }},
|
||||
[title_test(u'精霊の守り人',
|
||||
|
@ -31,7 +31,7 @@ msprefs.defaults['find_first_edition_date'] = False
|
||||
# Google covers are often poor quality (scans/errors) but they have high
|
||||
# resolution, so they trump covers from better sources. So make sure they
|
||||
# are only used if no other covers are found.
|
||||
msprefs.defaults['cover_priorities'] = {'Google':2, 'Google Images':2}
|
||||
msprefs.defaults['cover_priorities'] = {'Google':2, 'Google Images':2, 'Big Book Search':2}
|
||||
|
||||
def create_log(ostream=None):
|
||||
from calibre.utils.logging import ThreadSafeLog, FileStream
|
||||
@ -429,6 +429,40 @@ class Source(Plugin):
|
||||
mi.tags = list(map(fixcase, mi.tags))
|
||||
mi.isbn = check_isbn(mi.isbn)
|
||||
|
||||
def download_multiple_covers(self, title, authors, urls, get_best_cover, timeout, result_queue, abort, log, prefs_name='max_covers'):
|
||||
if not urls:
|
||||
log('No images found for, title: %r and authors: %r'%(title, authors))
|
||||
return
|
||||
from threading import Thread
|
||||
import time
|
||||
if prefs_name:
|
||||
urls = urls[:self.prefs[prefs_name]]
|
||||
if get_best_cover:
|
||||
urls = urls[:1]
|
||||
log('Downloading %d covers'%len(urls))
|
||||
workers = [Thread(target=self.download_image, args=(u, timeout, log, result_queue)) for u in urls]
|
||||
for w in workers:
|
||||
w.daemon = True
|
||||
w.start()
|
||||
alive = True
|
||||
start_time = time.time()
|
||||
while alive and not abort.is_set() and time.time() - start_time < timeout:
|
||||
alive = False
|
||||
for w in workers:
|
||||
if w.is_alive():
|
||||
alive = True
|
||||
break
|
||||
abort.wait(0.1)
|
||||
|
||||
def download_image(self, url, timeout, log, result_queue):
|
||||
try:
|
||||
ans = self.browser.open_novisit(url, timeout=timeout).read()
|
||||
result_queue.put((self, ans))
|
||||
log('Downloaded cover from: %s'%url)
|
||||
except Exception:
|
||||
self.log.exception('Failed to download cover from: %r'%url)
|
||||
|
||||
|
||||
# }}}
|
||||
|
||||
# Metadata API {{{
|
||||
|
src/calibre/ebooks/metadata/sources/big_book_search.py (new file, 58 lines)
@ -0,0 +1,58 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from calibre.ebooks.metadata.sources.base import Source, Option

def get_urls(br, tokens):
    from urllib import quote_plus
    from mechanize import Request
    from lxml import html
    escaped = [quote_plus(x.encode('utf-8')) for x in tokens if x and x.strip()]
    q = b'+'.join(escaped)
    url = 'http://bigbooksearch.com/books/'+q
    br.open(url).read()
    req = Request('http://bigbooksearch.com/query.php?SearchIndex=books&Keywords=%s&ItemPage=1'%q)
    req.add_header('X-Requested-With', 'XMLHttpRequest')
    req.add_header('Referer', url)
    raw = br.open(req).read()
    root = html.fromstring(raw.decode('utf-8'))
    urls = [i.get('src') for i in root.xpath('//img[@src]')]
    return urls

class BigBookSearch(Source):

    name = 'Big Book Search'
    description = _('Downloads multiple book covers from Amazon. Useful to find alternate covers.')
    capabilities = frozenset(['cover'])
    config_help_message = _('Configure the Big Book Search plugin')
    can_get_multiple_covers = True
    options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),
                      _('The maximum number of covers to process from the search result')),
    )
    supports_gzip_transfer_encoding = True

    def download_cover(self, log, result_queue, abort,
            title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
        if not title:
            return
        br = self.browser
        tokens = tuple(self.get_title_tokens(title)) + tuple(self.get_author_tokens(authors))
        urls = get_urls(br, tokens)
        self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)

def test():
    from calibre import browser
    import pprint
    br = browser()
    urls = get_urls(br, ['consider', 'phlebas', 'banks'])
    pprint.pprint(urls)

if __name__ == '__main__':
    test()
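The plugin's get_urls() drives the bigbooksearch.com AJAX endpoint and then simply harvests the src attribute of every <img> tag in the returned markup. A minimal standalone sketch of that harvesting step (lxml only, not calibre code; the HTML fragment is invented):

    # Collect image URLs from an HTML fragment, as get_urls() does with the AJAX response.
    from lxml import html

    fragment = '<div><img src="http://example.com/cover1.jpg"><img src="http://example.com/cover2.jpg"></div>'
    root = html.fromstring(fragment)
    urls = [img.get('src') for img in root.xpath('//img[@src]')]
    print(urls)  # ['http://example.com/cover1.jpg', 'http://example.com/cover2.jpg']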
@ -18,12 +18,13 @@ from calibre.utils.magick.draw import Image, save_cover_data_to

class Worker(Thread):

    def __init__(self, plugin, abort, title, authors, identifiers, timeout, rq):
    def __init__(self, plugin, abort, title, authors, identifiers, timeout, rq, get_best_cover=False):
        Thread.__init__(self)
        self.daemon = True

        self.plugin = plugin
        self.abort = abort
        self.get_best_cover = get_best_cover
        self.buf = BytesIO()
        self.log = create_log(self.buf)
        self.title, self.authors, self.identifiers = (title, authors,
@ -37,7 +38,7 @@ class Worker(Thread):
        try:
            if self.plugin.can_get_multiple_covers:
                self.plugin.download_cover(self.log, self.rq, self.abort,
                        title=self.title, authors=self.authors, get_best_cover=True,
                        title=self.title, authors=self.authors, get_best_cover=self.get_best_cover,
                        identifiers=self.identifiers, timeout=self.timeout)
            else:
                self.plugin.download_cover(self.log, self.rq, self.abort,
@ -72,7 +73,7 @@ def process_result(log, result):
    return (plugin, width, height, fmt, data)

def run_download(log, results, abort,
        title=None, authors=None, identifiers={}, timeout=30):
        title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
    '''
    Run the cover download, putting results into the queue :param:`results`.

@ -89,7 +90,7 @@ def run_download(log, results, abort,
    plugins = [p for p in metadata_plugins(['cover']) if p.is_configured()]

    rq = Queue()
    workers = [Worker(p, abort, title, authors, identifiers, timeout, rq) for p
    workers = [Worker(p, abort, title, authors, identifiers, timeout, rq, get_best_cover=get_best_cover) for p
            in plugins]
    for w in workers:
        w.start()
@ -163,7 +164,7 @@ def download_cover(log,
    abort = Event()

    run_download(log, rq, abort, title=title, authors=authors,
            identifiers=identifiers, timeout=timeout)
            identifiers=identifiers, timeout=timeout, get_best_cover=True)

    results = []
@ -106,6 +106,8 @@ class Worker(Thread): # {{{
            parts = pub.partition(':')[0::2]
            pub = parts[1] or parts[0]
            try:
                if ', Ship Date:' in pub:
                    pub = pub.partition(', Ship Date:')[0]
                q = parse_only_date(pub, assume_utc=True)
                if q.year != UNDEFINED_DATE:
                    mi.pubdate = q
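The added guard strips a trailing ', Ship Date: ...' suffix from the publisher string before a date is parsed out of it. A tiny standalone illustration (the publisher value is invented):

    # Drop the ship-date suffix before attempting date parsing.
    pub = 'Example House, Ship Date: April 2013'
    if ', Ship Date:' in pub:
        pub = pub.partition(', Ship Date:')[0]
    print(pub)  # -> 'Example House'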
@ -39,39 +39,11 @@ class GoogleImages(Source):
            title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
        if not title:
            return
        from threading import Thread
        import time
        timeout = max(60, timeout) # Needs at least a minute
        title = ' '.join(self.get_title_tokens(title))
        author = ' '.join(self.get_author_tokens(authors))
        urls = self.get_image_urls(title, author, log, abort, timeout)
        if not urls:
            log('No images found in Google for, title: %r and authors: %r'%(title, author))
            return
        urls = urls[:self.prefs['max_covers']]
        if get_best_cover:
            urls = urls[:1]
        workers = [Thread(target=self.download_image, args=(url, timeout, log, result_queue)) for url in urls]
        for w in workers:
            w.daemon = True
            w.start()
        alive = True
        start_time = time.time()
        while alive and not abort.is_set() and time.time() - start_time < timeout:
            alive = False
            for w in workers:
                if w.is_alive():
                    alive = True
                    break
            abort.wait(0.1)

    def download_image(self, url, timeout, log, result_queue):
        try:
            ans = self.browser.open_novisit(url, timeout=timeout).read()
            result_queue.put((self, ans))
            log('Downloaded cover from: %s'%url)
        except Exception:
            self.log.exception('Failed to download cover from: %r'%url)
        self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)

    def get_image_urls(self, title, author, log, abort, timeout):
        from calibre.utils.ipc.simple_worker import fork_job, WorkerError
@ -180,5 +180,6 @@ class BorderParse:
            elif 'single' in border_style_list:
                new_border_dict[att] = 'single'
            else:
                new_border_dict[att] = border_style_list[0]
                if border_style_list:
                    new_border_dict[att] = border_style_list[0]
        return new_border_dict
@ -559,11 +559,11 @@ class TOCView(QWidget): # {{{
        b.setToolTip(_('Remove all selected entries'))
        b.clicked.connect(self.del_items)

        self.left_button = b = QToolButton(self)
        self.right_button = b = QToolButton(self)
        b.setIcon(QIcon(I('forward.png')))
        b.setIconSize(QSize(ICON_SIZE, ICON_SIZE))
        l.addWidget(b, 4, 3)
        b.setToolTip(_('Unindent the current entry [Ctrl+Left]'))
        b.setToolTip(_('Indent the current entry [Ctrl+Right]'))
        b.clicked.connect(self.tocw.move_right)

        self.down_button = b = QToolButton(self)
@ -54,7 +54,7 @@ def get_parser(usage):
def get_db(dbpath, options):
    global do_notify
    if options.library_path is not None:
        dbpath = options.library_path
        dbpath = os.path.expanduser(options.library_path)
    if dbpath is None:
        raise ValueError('No saved library path, either run the GUI or use the'
                ' --with-library option')
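Wrapping the --with-library value in os.path.expanduser() lets a path such as ~/books resolve to the user's home directory. A one-line standalone illustration:

    import os
    print(os.path.expanduser('~/books'))  # e.g. '/home/alice/books'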
@ -174,7 +174,13 @@ def _extractall(f, path=None, file_info=None):
        has_data_descriptors = header.flags & (1 << 3)
        seekval = header.compressed_size + (16 if has_data_descriptors else 0)
        found = True
        parts = header.filename.split('/')
        # Sanitize path changing absolute to relative paths and removing .. and
        # .
        fname = header.filename.replace(os.sep, '/')
        fname = os.path.splitdrive(fname)[1]
        parts = [x for x in fname.split('/') if x not in {'', os.path.pardir, os.path.curdir}]
        if not parts:
            continue
        if header.uncompressed_size == 0:
            # Directory
            f.seek(f.tell()+seekval)
@ -17,8 +17,7 @@ class MReplace(UserDict):

    def compile_regex(self):
        if len(self.data) > 0:
            keys = sorted(self.data.keys(), key=len)
            keys.reverse()
            keys = sorted(self.data.keys(), key=len, reverse=True)
            tmp = "(%s)" % "|".join(map(re.escape, keys))
            if self.re != tmp:
                self.re = tmp
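Sorting the replacement keys longest-first matters because Python's regex alternation is first-match, not longest-match: a short key that is a prefix of a longer one would otherwise shadow it. A standalone demonstration (not calibre code; the sample mapping is invented):

    import re

    data = {'cat': 'dog', 'catalog': 'index'}
    # Alphabetical order puts 'cat' before 'catalog' in the alternation.
    bad = re.compile('(%s)' % '|'.join(map(re.escape, sorted(data))))
    # Longest-first order lets 'catalog' win when both could match.
    good = re.compile('(%s)' % '|'.join(map(re.escape, sorted(data, key=len, reverse=True))))

    print(bad.sub(lambda m: data[m.group(1)], 'my catalog'))   # 'my dogalog' -- 'cat' matched first
    print(good.sub(lambda m: data[m.group(1)], 'my catalog'))  # 'my index'   -- longest key wins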
@ -1099,10 +1099,13 @@ class ZipFile:

        base_target = targetpath # Added by Kovid

        # don't include leading "/" from file name if present
        fname = member.filename
        if fname.startswith('/'):
            fname = fname[1:]
        # Sanitize path, changing absolute paths to relative paths
        # and removing .. and . (changed by Kovid)
        fname = member.filename.replace(os.sep, '/')
        fname = os.path.splitdrive(fname)[1]
        fname = '/'.join(x for x in fname.split('/') if x not in {'', os.path.curdir, os.path.pardir})
        if not fname:
            raise BadZipfile('The member %r has an invalid name'%member.filename)

        targetpath = os.path.normpath(os.path.join(base_target, fname))
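Both this hunk and the _extractall() hunk above apply the same sanitization rule: normalize separators, drop any drive letter, and discard empty, '.' and '..' path components so an archive member can never escape the extraction directory. A standalone sketch of that rule (not calibre code; sanitize_member_name() is a hypothetical helper):

    import os

    def sanitize_member_name(name):
        # Normalize to '/' separators, strip a drive prefix, and remove unsafe components.
        fname = name.replace(os.sep, '/')
        fname = os.path.splitdrive(fname)[1]
        parts = [x for x in fname.split('/') if x not in {'', os.path.curdir, os.path.pardir}]
        return '/'.join(parts)

    print(sanitize_member_name('/etc/../../passwd'))    # -> 'etc/passwd'
    print(sanitize_member_name('./docs/./readme.txt'))  # -> 'docs/readme.txt'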