Sync to trunk.
@ -40,6 +40,7 @@ recipes/.gitignore
|
|||||||
recipes/README.md
|
recipes/README.md
|
||||||
recipes/icon_checker.py
|
recipes/icon_checker.py
|
||||||
recipes/readme_updater.py
|
recipes/readme_updater.py
|
||||||
|
recipes/garfield.recipe
|
||||||
recipes/katalog_egazeciarz.recipe
|
recipes/katalog_egazeciarz.recipe
|
||||||
recipes/tv_axnscifi.recipe
|
recipes/tv_axnscifi.recipe
|
||||||
recipes/tv_comedycentral.recipe
|
recipes/tv_comedycentral.recipe
|
||||||
@ -63,6 +64,7 @@ recipes/tv_tvppolonia.recipe
|
|||||||
recipes/tv_tvpuls.recipe
|
recipes/tv_tvpuls.recipe
|
||||||
recipes/tv_viasathistory.recipe
|
recipes/tv_viasathistory.recipe
|
||||||
recipes/icons/katalog_egazeciarz.png
|
recipes/icons/katalog_egazeciarz.png
|
||||||
|
recipes/icons/garfield.png
|
||||||
recipes/icons/tv_axnscifi.png
|
recipes/icons/tv_axnscifi.png
|
||||||
recipes/icons/tv_comedycentral.png
|
recipes/icons/tv_comedycentral.png
|
||||||
recipes/icons/tv_discoveryscience.png
|
recipes/icons/tv_discoveryscience.png
|
||||||
|
@ -20,6 +20,58 @@
|
|||||||
# new recipes:
|
# new recipes:
|
||||||
# - title:
|
# - title:
|
||||||
|
|
||||||
|
- version: 0.9.26
|
||||||
|
date: 2013-04-05
|
||||||
|
|
||||||
|
new features:
|
||||||
|
- title: "PDF Output: Allow using templates to create arbitrary headers and footers. Look under PDF Output in the conversion dialog for this feature."
|
||||||
|
|
||||||
|
- title: "ToC Editor: Allow generating the ToC directly from individual files inside the ebook. Useful for EPUBs that have individual chapters in single files."
|
||||||
|
tickets: [1163520]
|
||||||
|
|
||||||
|
- title: "ToC Editor: Add buttons to indent/unindent the current entry"
|
||||||
|
|
||||||
|
- title: "ToC Editor: Right-click menu to perform various useful actions on entries in the ToC"
|
||||||
|
|
||||||
|
- title: "Column icons: Allow use of wide images as column icons"
|
||||||
|
|
||||||
|
- title: "Add USB ids for the Palm Pre2 and Samsung Galaxy phone to the device drivers"
|
||||||
|
tickets: [1162293,1163115]
|
||||||
|
|
||||||
|
bug fixes:
|
||||||
|
- title: "PDF Output: Fix generating page numbers causing links to not work."
|
||||||
|
tickets: [1162573]
|
||||||
|
|
||||||
|
- title: "Wrong filename output in error message when 'Guide reference not found'"
|
||||||
|
tickets: [1163659]
|
||||||
|
|
||||||
|
- title: "Get Books: Update Amazon, Barnes & Noble, Waterstones and Gutenberg store plugins for website change"
|
||||||
|
|
||||||
|
- title: "PDF Output: Fix 1 pixel wide left and top margins on the cover page for some PDF conversions due to incorrect rounding."
|
||||||
|
tickets: [1162054]
|
||||||
|
|
||||||
|
- title: "ToC Editor: Fix drag and drop of multiple items resulting in the dropped items being in random order sometimes."
|
||||||
|
tickets: [1161999]
|
||||||
|
|
||||||
|
improved recipes:
|
||||||
|
- Financial Times UK
|
||||||
|
- Sing Tao Daily
|
||||||
|
- Apple Daily
|
||||||
|
- A List Apart
|
||||||
|
- Business Week
|
||||||
|
- Harpers printed edition
|
||||||
|
- Harvard Business Review
|
||||||
|
|
||||||
|
new recipes:
|
||||||
|
- title: AM730
|
||||||
|
author: Eddie Lau
|
||||||
|
|
||||||
|
- title: Arret sur images
|
||||||
|
author: Francois D
|
||||||
|
|
||||||
|
- title: Diario de Noticias
|
||||||
|
author: Jose Pinto
|
||||||
|
|
||||||
- version: 0.9.25
|
- version: 0.9.25
|
||||||
date: 2013-03-29
|
date: 2013-03-29
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import re
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
|
||||||
@ -39,7 +40,7 @@ class BusinessWeekMagazine(BasicNewsRecipe):
|
|||||||
title=self.tag_to_string(div.a).strip()
|
title=self.tag_to_string(div.a).strip()
|
||||||
url=div.a['href']
|
url=div.a['href']
|
||||||
soup0 = self.index_to_soup(url)
|
soup0 = self.index_to_soup(url)
|
||||||
urlprint=soup0.find('li', attrs={'class':'print tracked'}).a['href']
|
urlprint=soup0.find('a', attrs={'href':re.compile('.*printer.*')})['href']
|
||||||
articles.append({'title':title, 'url':urlprint, 'description':'', 'date':''})
|
articles.append({'title':title, 'url':urlprint, 'description':'', 'date':''})
|
||||||
|
|
||||||
|
|
||||||
@ -56,7 +57,7 @@ class BusinessWeekMagazine(BasicNewsRecipe):
|
|||||||
title=self.tag_to_string(div.a).strip()
|
title=self.tag_to_string(div.a).strip()
|
||||||
url=div.a['href']
|
url=div.a['href']
|
||||||
soup0 = self.index_to_soup(url)
|
soup0 = self.index_to_soup(url)
|
||||||
urlprint=soup0.find('li', attrs={'class':'print tracked'}).a['href']
|
urlprint=soup0.find('a', attrs={'href':re.compile('.*printer.*')})['href']
|
||||||
articles.append({'title':title, 'url':urlprint, 'description':desc, 'date':''})
|
articles.append({'title':title, 'url':urlprint, 'description':desc, 'date':''})
|
||||||
|
|
||||||
if articles:
|
if articles:
|
||||||
|
23
recipes/diario_de_noticias.recipe
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
# vim:fileencoding=UTF-8
|
||||||
|
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class AdvancedUserRecipe1365070687(BasicNewsRecipe):
|
||||||
|
title ='Diário de Notícias'
|
||||||
|
oldest_article = 7
|
||||||
|
language = 'pt'
|
||||||
|
__author__ = 'Jose Pinto'
|
||||||
|
max_articles_per_feed = 100
|
||||||
|
keep_only_tags = [dict(name='div', attrs={'id':'cln-esqmid'}) ]
|
||||||
|
remove_tags = [ dict(name='table', attrs={'class':'TabFerramentasInf'}) ]
|
||||||
|
|
||||||
|
feeds = [(u'Portugal', u'http://feeds.dn.pt/DN-Portugal'),
|
||||||
|
(u'Globo', u'http://feeds.dn.pt/DN-Globo'),
|
||||||
|
(u'Economia', u'http://feeds.dn.pt/DN-Economia'),
|
||||||
|
(u'Ci\xeancia', u'http://feeds.dn.pt/DN-Ciencia'),
|
||||||
|
(u'Artes', u'http://feeds.dn.pt/DN-Artes'),
|
||||||
|
(u'TV & Media', u'http://feeds.dn.pt/DN-Media'),
|
||||||
|
(u'Opini\xe3o', u'http://feeds.dn.pt/DN-Opiniao'),
|
||||||
|
(u'Pessoas', u'http://feeds.dn.pt/DN-Pessoas')
|
||||||
|
]
|
17
recipes/economia.recipe
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class AdvancedUserRecipe1314326622(BasicNewsRecipe):
|
||||||
|
title = u'Economia'
|
||||||
|
__author__ = 'Manish Bhattarai'
|
||||||
|
description = 'Economia - Intelligence & Insight for ICAEW Members'
|
||||||
|
language = 'en_GB'
|
||||||
|
oldest_article = 7
|
||||||
|
max_articles_per_feed = 25
|
||||||
|
masthead_url = 'http://economia.icaew.com/~/media/Images/Design%20Images/Economia_Red_website.ashx'
|
||||||
|
cover_url = 'http://economia.icaew.com/~/media/Images/Design%20Images/Economia_Red_website.ashx'
|
||||||
|
no_stylesheets = True
|
||||||
|
remove_empty_feeds = True
|
||||||
|
remove_tags_before = dict(id='content')
|
||||||
|
remove_tags_after = dict(id='stars-wrapper')
|
||||||
|
remove_tags = [dict(attrs={'class':['floatR', 'sharethis', 'rating clearfix']})]
|
||||||
|
feeds = [(u'News', u'http://feedity.com/icaew-com/VlNTVFRa.rss'),(u'Business', u'http://feedity.com/icaew-com/VlNTVFtS.rss'),(u'People', u'http://feedity.com/icaew-com/VlNTVFtX.rss'),(u'Opinion', u'http://feedity.com/icaew-com/VlNTVFtW.rss'),(u'Finance', u'http://feedity.com/icaew-com/VlNTVFtV.rss')]
|
@ -12,12 +12,6 @@ class EsensjaRSS(BasicNewsRecipe):
|
|||||||
language = 'pl'
|
language = 'pl'
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
INDEX = 'http://www.esensja.pl'
|
INDEX = 'http://www.esensja.pl'
|
||||||
extra_css = '''.t-title {font-size: x-large; font-weight: bold; text-align: left}
|
|
||||||
.t-author {font-size: x-small; text-align: left}
|
|
||||||
.t-title2 {font-size: x-small; font-style: italic; text-align: left}
|
|
||||||
.text {font-size: small; text-align: left}
|
|
||||||
.annot-ref {font-style: italic; text-align: left}
|
|
||||||
'''
|
|
||||||
cover_url = ''
|
cover_url = ''
|
||||||
masthead_url = 'http://esensja.pl/img/wrss.gif'
|
masthead_url = 'http://esensja.pl/img/wrss.gif'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
|
@ -110,10 +110,12 @@ class FinancialTimes(BasicNewsRecipe):
|
|||||||
soup = self.index_to_soup(self.INDEX)
|
soup = self.index_to_soup(self.INDEX)
|
||||||
#dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
|
#dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
|
||||||
#self.timefmt = ' [%s]'%dates
|
#self.timefmt = ' [%s]'%dates
|
||||||
|
section_title = 'Untitled'
|
||||||
|
|
||||||
for column in soup.findAll('div', attrs = {'class':'feedBoxes clearfix'}):
|
for column in soup.findAll('div', attrs = {'class':'feedBoxes clearfix'}):
|
||||||
for section in column. findAll('div', attrs = {'class':'feedBox'}):
|
for section in column. findAll('div', attrs = {'class':'feedBox'}):
|
||||||
section_title=self.tag_to_string(section.find('h4'))
|
sectiontitle=self.tag_to_string(section.find('h4'))
|
||||||
|
if '...' not in sectiontitle: section_title=sectiontitle
|
||||||
for article in section.ul.findAll('li'):
|
for article in section.ul.findAll('li'):
|
||||||
articles = []
|
articles = []
|
||||||
title=self.tag_to_string(article.a)
|
title=self.tag_to_string(article.a)
|
||||||
|
53
recipes/forbes_pl.recipe
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
import datetime
|
||||||
|
import re
|
||||||
|
|
||||||
|
class forbes_pl(BasicNewsRecipe):
|
||||||
|
title = u'Forbes.pl'
|
||||||
|
__author__ = 'Artur Stachecki <artur.stachecki@gmail.com>'
|
||||||
|
language = 'pl'
|
||||||
|
description = u'Biznes, finanse, gospodarka, strategie, wiadomości gospodarcze, analizy finasowe i strategiczne.'
|
||||||
|
oldest_article = 1
|
||||||
|
index = 'http://www.forbes.pl'
|
||||||
|
cover_url = 'http://www.forbes.pl/resources/front/images/logo.png'
|
||||||
|
max_articles_per_feed = 100
|
||||||
|
extra_css = '.Block-Photo {float:left; max-width: 300px; margin-right: 5px;}'
|
||||||
|
preprocess_regexps = [(re.compile(ur'<p>(<strong>)?(Czytaj|Zobacz) (też|także):.*?</p>', re.DOTALL), lambda match: ''), (re.compile(ur'<strong>Zobacz:.*?</strong>', re.DOTALL), lambda match: '')]
|
||||||
|
remove_javascript = True
|
||||||
|
no_stylesheets = True
|
||||||
|
now = datetime.datetime.now()
|
||||||
|
yesterday = now - datetime.timedelta(hours=24)
|
||||||
|
yesterday = yesterday.strftime("%d.%m.%Y %H:%M:%S")
|
||||||
|
pages_count = 4
|
||||||
|
keep_only_tags = [dict(attrs={'class':['Block-Node Content-Article ', 'Block-Node Content-Article piano-closed']})]
|
||||||
|
remove_tags = [dict(attrs={'class':['Keywords Styled', 'twitter-share-button', 'Block-List-Related Block-List']})]
|
||||||
|
|
||||||
|
feeds = [(u'Wszystkie', 'http://www.forbes.pl/rss')]
|
||||||
|
|
||||||
|
'''def preprocess_html(self, soup):
|
||||||
|
self.append_page(soup, soup.body)
|
||||||
|
return soup
|
||||||
|
|
||||||
|
|
||||||
|
def append_page(self, soup, appendtag):
|
||||||
|
cleanup = False
|
||||||
|
nexturl = appendtag.find('a', attrs={'class':'next'})
|
||||||
|
if nexturl:
|
||||||
|
cleanup = True
|
||||||
|
while nexturl:
|
||||||
|
soup2 = self.index_to_soup(self.index + nexturl['href'])
|
||||||
|
nexturl = soup2.find('a', attrs={'class':'next'})
|
||||||
|
pagetext = soup2.findAll(id='article-body-wrapper')
|
||||||
|
if not pagetext:
|
||||||
|
pagetext = soup2.findAll(attrs={'class':'Article-Entry Styled'})
|
||||||
|
for comment in pagetext.findAll(text=lambda text:isinstance(text, Comment)):
|
||||||
|
comment.extract()
|
||||||
|
pos = len(appendtag.contents)
|
||||||
|
appendtag.insert(pos, pagetext)
|
||||||
|
if cleanup:
|
||||||
|
for r in appendtag.findAll(attrs={'class':'paginator'}):
|
||||||
|
r.extract()'''
|
108
recipes/galaxys_edge.recipe
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
from __future__ import with_statement
|
||||||
|
__license__ = 'GPL 3'
|
||||||
|
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class GalaxyEdge(BasicNewsRecipe):
|
||||||
|
title = u'The Galaxy\'s Edge'
|
||||||
|
language = 'en'
|
||||||
|
|
||||||
|
oldest_article = 7
|
||||||
|
__author__ = 'Krittika Goyal'
|
||||||
|
no_stylesheets = True
|
||||||
|
|
||||||
|
auto_cleanup = True
|
||||||
|
|
||||||
|
#keep_only_tags = [dict(id='content')]
|
||||||
|
#remove_tags = [dict(attrs={'class':['article-links', 'breadcr']}),
|
||||||
|
#dict(id=['email-section', 'right-column', 'printfooter', 'topover',
|
||||||
|
#'slidebox', 'th_footer'])]
|
||||||
|
|
||||||
|
extra_css = '.photo-caption { font-size: smaller }'
|
||||||
|
|
||||||
|
def parse_index(self):
|
||||||
|
soup = self.index_to_soup('http://www.galaxysedge.com/')
|
||||||
|
main = soup.find('table', attrs={'width':'911'})
|
||||||
|
toc = main.find('td', attrs={'width':'225'})
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
current_section = None
|
||||||
|
current_articles = []
|
||||||
|
feeds = []
|
||||||
|
c = 0
|
||||||
|
for x in toc.findAll(['p']):
|
||||||
|
c = c+1
|
||||||
|
if c == 5:
|
||||||
|
if current_articles and current_section:
|
||||||
|
feeds.append((current_section, current_articles))
|
||||||
|
edwo = x.find('a')
|
||||||
|
current_section = self.tag_to_string(edwo)
|
||||||
|
current_articles = []
|
||||||
|
self.log('\tFound section:', current_section)
|
||||||
|
title = self.tag_to_string(edwo)
|
||||||
|
url = edwo.get('href', True)
|
||||||
|
url = 'http://www.galaxysedge.com/'+url
|
||||||
|
print(title)
|
||||||
|
print(c)
|
||||||
|
if not url or not title:
|
||||||
|
continue
|
||||||
|
self.log('\t\tFound article:', title)
|
||||||
|
self.log('\t\t\t', url)
|
||||||
|
current_articles.append({'title': title, 'url':url,
|
||||||
|
'description':'', 'date':''})
|
||||||
|
elif c>5:
|
||||||
|
current_section = self.tag_to_string(x.find('b'))
|
||||||
|
current_articles = []
|
||||||
|
self.log('\tFound section:', current_section)
|
||||||
|
for y in x.findAll('a'):
|
||||||
|
title = self.tag_to_string(y)
|
||||||
|
url = y.get('href', True)
|
||||||
|
url = 'http://www.galaxysedge.com/'+url
|
||||||
|
print(title)
|
||||||
|
if not url or not title:
|
||||||
|
continue
|
||||||
|
self.log('\t\tFound article:', title)
|
||||||
|
self.log('\t\t\t', url)
|
||||||
|
current_articles.append({'title': title, 'url':url,
|
||||||
|
'description':'', 'date':''})
|
||||||
|
if current_articles and current_section:
|
||||||
|
feeds.append((current_section, current_articles))
|
||||||
|
|
||||||
|
return feeds
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#def preprocess_raw_html(self, raw, url):
|
||||||
|
#return raw.replace('<body><p>', '<p>').replace('</p></body>', '</p>')
|
||||||
|
|
||||||
|
#def postprocess_html(self, soup, first_fetch):
|
||||||
|
#for t in soup.findAll(['table', 'tr', 'td','center']):
|
||||||
|
#t.name = 'div'
|
||||||
|
#return soup
|
||||||
|
|
||||||
|
#def parse_index(self):
|
||||||
|
#today = time.strftime('%Y-%m-%d')
|
||||||
|
#soup = self.index_to_soup(
|
||||||
|
#'http://www.thehindu.com/todays-paper/tp-index/?date=' + today)
|
||||||
|
#div = soup.find(id='left-column')
|
||||||
|
#feeds = []
|
||||||
|
#current_section = None
|
||||||
|
#current_articles = []
|
||||||
|
#for x in div.findAll(['h3', 'div']):
|
||||||
|
#if current_section and x.get('class', '') == 'tpaper':
|
||||||
|
#a = x.find('a', href=True)
|
||||||
|
#if a is not None:
|
||||||
|
#current_articles.append({'url':a['href']+'?css=print',
|
||||||
|
#'title':self.tag_to_string(a), 'date': '',
|
||||||
|
#'description':''})
|
||||||
|
#if x.name == 'h3':
|
||||||
|
#if current_section and current_articles:
|
||||||
|
#feeds.append((current_section, current_articles))
|
||||||
|
#current_section = self.tag_to_string(x)
|
||||||
|
#current_articles = []
|
||||||
|
#return feeds
|
||||||
|
|
||||||
|
|
@ -10,7 +10,7 @@ krakow.gazeta.pl
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class gw_krakow(BasicNewsRecipe):
|
class gw_krakow(BasicNewsRecipe):
|
||||||
title = u'Gazeta.pl Kraków'
|
title = u'Gazeta Wyborcza Kraków'
|
||||||
__author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
|
__author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
description =u'Wiadomości z Krakowa na portalu Gazeta.pl.'
|
description =u'Wiadomości z Krakowa na portalu Gazeta.pl.'
|
||||||
|
@ -5,7 +5,7 @@ import string
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class GazetaPlSzczecin(BasicNewsRecipe):
|
class GazetaPlSzczecin(BasicNewsRecipe):
|
||||||
title = u'Gazeta.pl Szczecin'
|
title = u'Gazeta Wyborcza Szczecin'
|
||||||
description = u'Wiadomości ze Szczecina na portalu Gazeta.pl.'
|
description = u'Wiadomości ze Szczecina na portalu Gazeta.pl.'
|
||||||
__author__ = u'Michał Szkutnik'
|
__author__ = u'Michał Szkutnik'
|
||||||
__license__ = u'GPL v3'
|
__license__ = u'GPL v3'
|
||||||
|
@ -10,7 +10,7 @@ warszawa.gazeta.pl
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class gw_wawa(BasicNewsRecipe):
|
class gw_wawa(BasicNewsRecipe):
|
||||||
title = u'Gazeta.pl Warszawa'
|
title = u'Gazeta Wyborcza Warszawa'
|
||||||
__author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
|
__author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
description ='Wiadomości z Warszawy na portalu Gazeta.pl.'
|
description ='Wiadomości z Warszawy na portalu Gazeta.pl.'
|
||||||
|
@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
|||||||
from calibre.ebooks.BeautifulSoup import Comment
|
from calibre.ebooks.BeautifulSoup import Comment
|
||||||
|
|
||||||
class Gazeta_Wyborcza(BasicNewsRecipe):
|
class Gazeta_Wyborcza(BasicNewsRecipe):
|
||||||
title = u'Gazeta.pl'
|
title = u'Gazeta Wyborcza'
|
||||||
__author__ = 'fenuks, Artur Stachecki'
|
__author__ = 'fenuks, Artur Stachecki'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.'
|
description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.'
|
||||||
|
@ -20,7 +20,7 @@ class HBR(BasicNewsRecipe):
|
|||||||
'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn',
|
'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn',
|
||||||
'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR',
|
'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR',
|
||||||
'mailingListTout', 'partnerCenter', 'pageFooter',
|
'mailingListTout', 'partnerCenter', 'pageFooter',
|
||||||
'superNavHeadContainer', 'hbrDisqus',
|
'superNavHeadContainer', 'hbrDisqus', 'article-toolbox',
|
||||||
'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD']),
|
'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD']),
|
||||||
dict(name='iframe')]
|
dict(name='iframe')]
|
||||||
extra_css = '''
|
extra_css = '''
|
||||||
|
BIN
recipes/icons/forbes_pl.png
Normal file
After Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 802 B After Width: | Height: | Size: 294 B |
Before Width: | Height: | Size: 802 B After Width: | Height: | Size: 294 B |
Before Width: | Height: | Size: 802 B After Width: | Height: | Size: 294 B |
Before Width: | Height: | Size: 802 B After Width: | Height: | Size: 294 B |
BIN
recipes/icons/slashdot.png
Normal file
After Width: | Height: | Size: 250 B |
BIN
recipes/icons/sportowefakty.png
Normal file
After Width: | Height: | Size: 511 B |
BIN
recipes/icons/wysokie_obcasy.png
Normal file
After Width: | Height: | Size: 205 B |
@ -1,64 +1,44 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2008-2013, Darko Miletic <darko.miletic at gmail.com>'
|
|
||||||
'''
|
|
||||||
newyorker.com
|
|
||||||
'''
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
www.canada.com
|
||||||
|
'''
|
||||||
|
import re
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||||
|
|
||||||
class NewYorker(BasicNewsRecipe):
|
class NewYorker(BasicNewsRecipe):
|
||||||
title = 'The New Yorker'
|
|
||||||
__author__ = 'Darko Miletic'
|
|
||||||
description = 'The best of US journalism'
|
|
||||||
oldest_article = 15
|
|
||||||
language = 'en'
|
|
||||||
max_articles_per_feed = 100
|
|
||||||
no_stylesheets = True
|
|
||||||
use_embedded_content = False
|
|
||||||
publisher = 'Conde Nast Publications'
|
|
||||||
category = 'news, politics, USA'
|
|
||||||
encoding = 'cp1252'
|
|
||||||
publication_type = 'magazine'
|
|
||||||
masthead_url = 'http://www.newyorker.com/css/i/hed/logo.gif'
|
|
||||||
extra_css = """
|
|
||||||
body {font-family: "Times New Roman",Times,serif}
|
|
||||||
.articleauthor{color: #9F9F9F;
|
|
||||||
font-family: Arial, sans-serif;
|
|
||||||
font-size: small;
|
|
||||||
text-transform: uppercase}
|
|
||||||
.rubric,.dd,h6#credit{color: #CD0021;
|
|
||||||
font-family: Arial, sans-serif;
|
|
||||||
font-size: small;
|
|
||||||
text-transform: uppercase}
|
|
||||||
.descender:first-letter{display: inline; font-size: xx-large; font-weight: bold}
|
|
||||||
.dd,h6#credit{color: gray}
|
|
||||||
.c{display: block}
|
|
||||||
.caption,h2#articleintro{font-style: italic}
|
|
||||||
.caption{font-size: small}
|
|
||||||
"""
|
|
||||||
|
|
||||||
conversion_options = {
|
|
||||||
'comment' : description
|
|
||||||
, 'tags' : category
|
|
||||||
, 'publisher' : publisher
|
|
||||||
, 'language' : language
|
|
||||||
}
|
|
||||||
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'id':'pagebody'})]
|
title = u'New Yorker Magazine'
|
||||||
remove_tags = [
|
newyorker_prefix = 'http://m.newyorker.com'
|
||||||
dict(name=['meta','iframe','base','link','embed','object'])
|
description = u'Content from the New Yorker website'
|
||||||
,dict(attrs={'class':['utils','socialUtils','articleRailLinks','icons','social-utils-top','entry-keywords','entry-categories','utilsPrintEmail'] })
|
fp_tag = 'CAN_TC'
|
||||||
,dict(attrs={'id':['show-header','show-footer'] })
|
|
||||||
]
|
|
||||||
remove_tags_after = dict(attrs={'class':'entry-content'})
|
|
||||||
remove_attributes = ['lang']
|
|
||||||
feeds = [(u'The New Yorker', u'http://www.newyorker.com/services/mrss/feeds/everything.xml')]
|
|
||||||
|
|
||||||
def print_version(self, url):
|
masthead_url = 'http://www.newyorker.com/images/elements/print/newyorker_printlogo.gif'
|
||||||
return url + '?printable=true¤tPage=all'
|
|
||||||
|
|
||||||
def image_url_processor(self, baseurl, url):
|
compress_news_images = True
|
||||||
return url.strip()
|
compress_news_images_auto_size = 8
|
||||||
|
scale_news_images_to_device = False
|
||||||
|
scale_news_images = (768, 1024)
|
||||||
|
|
||||||
|
url_list = []
|
||||||
|
language = 'en'
|
||||||
|
__author__ = 'Nick Redding'
|
||||||
|
no_stylesheets = True
|
||||||
|
timefmt = ' [%b %d]'
|
||||||
|
encoding = 'utf-8'
|
||||||
|
extra_css = '''
|
||||||
|
.byline { font-size:xx-small; font-weight: bold;}
|
||||||
|
h3 { margin-bottom: 6px; }
|
||||||
|
.caption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
||||||
|
'''
|
||||||
|
keep_only_tags = [dict(name='div', attrs={'id':re.compile('pagebody')})]
|
||||||
|
|
||||||
|
remove_tags = [{'class':'socialUtils'},{'class':'entry-keywords'}]
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
cover_url = "http://www.newyorker.com/images/covers/1925/1925_02_21_p233.jpg"
|
cover_url = "http://www.newyorker.com/images/covers/1925/1925_02_21_p233.jpg"
|
||||||
@ -68,13 +48,233 @@ class NewYorker(BasicNewsRecipe):
|
|||||||
cover_url = 'http://www.newyorker.com' + cover_item.div.img['src'].strip()
|
cover_url = 'http://www.newyorker.com' + cover_item.div.img['src'].strip()
|
||||||
return cover_url
|
return cover_url
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def fixChars(self,string):
|
||||||
for item in soup.findAll(style=True):
|
# Replace lsquo (\x91)
|
||||||
del item['style']
|
fixed = re.sub("\x91","‘",string)
|
||||||
auth = soup.find(attrs={'id':'articleauthor'})
|
# Replace rsquo (\x92)
|
||||||
if auth:
|
fixed = re.sub("\x92","’",fixed)
|
||||||
alink = auth.find('a')
|
# Replace ldquo (\x93)
|
||||||
if alink and alink.string is not None:
|
fixed = re.sub("\x93","“",fixed)
|
||||||
txt = alink.string
|
# Replace rdquo (\x94)
|
||||||
alink.replaceWith(txt)
|
fixed = re.sub("\x94","”",fixed)
|
||||||
|
# Replace ndash (\x96)
|
||||||
|
fixed = re.sub("\x96","–",fixed)
|
||||||
|
# Replace mdash (\x97)
|
||||||
|
fixed = re.sub("\x97","—",fixed)
|
||||||
|
fixed = re.sub("’","’",fixed)
|
||||||
|
return fixed
|
||||||
|
|
||||||
|
def massageNCXText(self, description):
|
||||||
|
# Kindle TOC descriptions won't render certain characters
|
||||||
|
if description:
|
||||||
|
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
||||||
|
# Replace '&' with '&'
|
||||||
|
massaged = re.sub("&","&", massaged)
|
||||||
|
return self.fixChars(massaged)
|
||||||
|
else:
|
||||||
|
return description
|
||||||
|
|
||||||
|
def populate_article_metadata(self, article, soup, first):
|
||||||
|
if first:
|
||||||
|
picdiv = soup.find('body').find('img')
|
||||||
|
if picdiv is not None:
|
||||||
|
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
|
||||||
|
xtitle = article.text_summary.strip()
|
||||||
|
if len(xtitle) == 0:
|
||||||
|
desc = soup.find('meta',attrs={'property':'og:description'})
|
||||||
|
if desc is not None:
|
||||||
|
article.summary = article.text_summary = desc['content']
|
||||||
|
shortparagraph = ""
|
||||||
|
## try:
|
||||||
|
if len(article.text_summary.strip()) == 0:
|
||||||
|
articlebodies = soup.findAll('div',attrs={'class':'entry-content'})
|
||||||
|
if articlebodies:
|
||||||
|
for articlebody in articlebodies:
|
||||||
|
if articlebody:
|
||||||
|
paras = articlebody.findAll('p')
|
||||||
|
for p in paras:
|
||||||
|
refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
|
||||||
|
#account for blank paragraphs and short paragraphs by appending them to longer ones
|
||||||
|
if len(refparagraph) > 0:
|
||||||
|
if len(refparagraph) > 70: #approximately one line of text
|
||||||
|
newpara = shortparagraph + refparagraph
|
||||||
|
article.summary = article.text_summary = newpara.strip()
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
shortparagraph = refparagraph + " "
|
||||||
|
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
|
||||||
|
shortparagraph = shortparagraph + "- "
|
||||||
|
else:
|
||||||
|
article.summary = article.text_summary = self.massageNCXText(article.text_summary)
|
||||||
|
## except:
|
||||||
|
## self.log("Error creating article descriptions")
|
||||||
|
## return
|
||||||
|
|
||||||
|
|
||||||
|
def strip_anchors(self,soup):
|
||||||
|
paras = soup.findAll(True)
|
||||||
|
for para in paras:
|
||||||
|
aTags = para.findAll('a')
|
||||||
|
for a in aTags:
|
||||||
|
if a.img is None:
|
||||||
|
a.replaceWith(a.renderContents().decode('cp1252','replace'))
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
def preprocess_html(self,soup):
|
||||||
|
dateline = soup.find('div','published')
|
||||||
|
byline = soup.find('div','byline')
|
||||||
|
title = soup.find('h1','entry-title')
|
||||||
|
if title is None:
|
||||||
|
return self.strip_anchors(soup)
|
||||||
|
if byline is None:
|
||||||
|
title.append(dateline)
|
||||||
|
return self.strip_anchors(soup)
|
||||||
|
byline.append(dateline)
|
||||||
|
return self.strip_anchors(soup)
|
||||||
|
|
||||||
|
def load_global_nav(self,soup):
|
||||||
|
seclist = []
|
||||||
|
ul = soup.find('ul',attrs={'id':re.compile('global-nav-menu')})
|
||||||
|
if ul is not None:
|
||||||
|
for li in ul.findAll('li'):
|
||||||
|
if li.a is not None:
|
||||||
|
securl = li.a['href']
|
||||||
|
if securl != '/' and securl != '/magazine' and securl.startswith('/'):
|
||||||
|
seclist.append((self.tag_to_string(li.a),self.newyorker_prefix+securl))
|
||||||
|
return seclist
|
||||||
|
|
||||||
|
def exclude_url(self,url):
|
||||||
|
if url in self.url_list:
|
||||||
|
return True
|
||||||
|
if not url.endswith('html'):
|
||||||
|
return True
|
||||||
|
if 'goings-on-about-town-app' in url:
|
||||||
|
return True
|
||||||
|
if 'something-to-be-thankful-for' in url:
|
||||||
|
return True
|
||||||
|
if '/shouts/' in url:
|
||||||
|
return True
|
||||||
|
if 'out-loud' in url:
|
||||||
|
return True
|
||||||
|
if '/rss/' in url:
|
||||||
|
return True
|
||||||
|
if '/video-' in url:
|
||||||
|
return True
|
||||||
|
self.url_list.append(url)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def load_index_page(self,soup):
|
||||||
|
article_list = []
|
||||||
|
for div in soup.findAll('div',attrs={'class':re.compile('^rotator')}):
|
||||||
|
h2 = div.h2
|
||||||
|
if h2 is not None:
|
||||||
|
a = h2.a
|
||||||
|
if a is not None:
|
||||||
|
url = a['href']
|
||||||
|
if not self.exclude_url(url):
|
||||||
|
if url.startswith('/'):
|
||||||
|
url = self.newyorker_prefix+url
|
||||||
|
byline = h2.span
|
||||||
|
if byline is not None:
|
||||||
|
author = self.tag_to_string(byline)
|
||||||
|
if author.startswith('by '):
|
||||||
|
author.replace('by ','')
|
||||||
|
byline.extract()
|
||||||
|
else:
|
||||||
|
author = ''
|
||||||
|
if h2.br is not None:
|
||||||
|
h2.br.replaceWith(' ')
|
||||||
|
title = self.tag_to_string(h2)
|
||||||
|
desc = div.find(attrs={'class':['rotator-ad-body','feature-blurb-text']})
|
||||||
|
if desc is not None:
|
||||||
|
description = self.tag_to_string(desc)
|
||||||
|
else:
|
||||||
|
description = ''
|
||||||
|
article_list.append(dict(title=title,url=url,date='',description=description,author=author,content=''))
|
||||||
|
ul = div.find('ul','feature-blurb-links')
|
||||||
|
if ul is not None:
|
||||||
|
for li in ul.findAll('li'):
|
||||||
|
a = li.a
|
||||||
|
if a is not None:
|
||||||
|
url = a['href']
|
||||||
|
if not self.exclude_url(url):
|
||||||
|
if url.startswith('/'):
|
||||||
|
url = self.newyorker_prefix+url
|
||||||
|
if a.br is not None:
|
||||||
|
a.br.replaceWith(' ')
|
||||||
|
title = '>>'+self.tag_to_string(a)
|
||||||
|
article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
|
||||||
|
for h3 in soup.findAll('h3','header'):
|
||||||
|
a = h3.a
|
||||||
|
if a is not None:
|
||||||
|
url = a['href']
|
||||||
|
if not self.exclude_url(url):
|
||||||
|
if url.startswith('/'):
|
||||||
|
url = self.newyorker_prefix+url
|
||||||
|
byline = h3.span
|
||||||
|
if byline is not None:
|
||||||
|
author = self.tag_to_string(byline)
|
||||||
|
if author.startswith('by '):
|
||||||
|
author = author.replace('by ','')
|
||||||
|
byline.extract()
|
||||||
|
else:
|
||||||
|
author = ''
|
||||||
|
if h3.br is not None:
|
||||||
|
h3.br.replaceWith(' ')
|
||||||
|
title = self.tag_to_string(h3).strip()
|
||||||
|
article_list.append(dict(title=title,url=url,date='',description='',author=author,content=''))
|
||||||
|
return article_list
|
||||||
|
|
||||||
|
def load_global_section(self,securl):
|
||||||
|
article_list = []
|
||||||
|
try:
|
||||||
|
soup = self.index_to_soup(securl)
|
||||||
|
except:
|
||||||
|
return article_list
|
||||||
|
if '/blogs/' not in securl:
|
||||||
|
return self.load_index_page(soup)
|
||||||
|
for div in soup.findAll('div',attrs={'id':re.compile('^entry')}):
|
||||||
|
h3 = div.h3
|
||||||
|
if h3 is not None:
|
||||||
|
a = h3.a
|
||||||
|
if a is not None:
|
||||||
|
url = a['href']
|
||||||
|
if not self.exclude_url(url):
|
||||||
|
if url.startswith('/'):
|
||||||
|
url = self.newyorker_prefix+url
|
||||||
|
if h3.br is not None:
|
||||||
|
h3.br.replaceWith(' ')
|
||||||
|
title = self.tag_to_string(h3)
|
||||||
|
article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
|
||||||
|
return article_list
|
||||||
|
|
||||||
|
def filter_ans(self, ans) :
|
||||||
|
total_article_count = 0
|
||||||
|
idx = 0
|
||||||
|
idx_max = len(ans)-1
|
||||||
|
while idx <= idx_max:
|
||||||
|
if True: #self.verbose
|
||||||
|
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
|
||||||
|
for article in ans[idx][1]:
|
||||||
|
total_article_count += 1
|
||||||
|
if True: #self.verbose
|
||||||
|
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
|
||||||
|
article['url'].replace('http://m.newyorker.com','').encode('cp1252','replace')))
|
||||||
|
idx = idx+1
|
||||||
|
self.log( "Queued %d articles" % total_article_count )
|
||||||
|
return ans
|
||||||
|
|
||||||
|
|
||||||
|
def parse_index(self):
|
||||||
|
ans = []
|
||||||
|
try:
|
||||||
|
soup = self.index_to_soup(self.newyorker_prefix)
|
||||||
|
except:
|
||||||
|
return ans
|
||||||
|
seclist = self.load_global_nav(soup)
|
||||||
|
ans.append(('Front Page',self.load_index_page(soup)))
|
||||||
|
for (sectitle,securl) in seclist:
|
||||||
|
ans.append((sectitle,self.load_global_section(securl)))
|
||||||
|
return self.filter_ans(ans)
|
||||||
|
|
||||||
|
70
recipes/sportowefakty.recipe
Normal file
@ -0,0 +1,70 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
|
||||||
|
import re
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
from calibre.utils.magick import Image
|
||||||
|
|
||||||
|
class sportowefakty(BasicNewsRecipe):
|
||||||
|
title = u'SportoweFakty'
|
||||||
|
__author__ = 'Artur Stachecki <artur.stachecki@gmail.com>, Tomasz Długosz <tomek3d@gmail.com>'
|
||||||
|
language = 'pl'
|
||||||
|
description = u'Najważniejsze informacje sportowe z kraju i ze świata, relacje, komentarze, wywiady, zdjęcia!'
|
||||||
|
oldest_article = 1
|
||||||
|
masthead_url='http://www.sportowefakty.pl/images/logo.png'
|
||||||
|
max_articles_per_feed = 100
|
||||||
|
simultaneous_downloads = 5
|
||||||
|
use_embedded_content=False
|
||||||
|
remove_javascript=True
|
||||||
|
no_stylesheets=True
|
||||||
|
ignore_duplicate_articles = {'title', 'url'}
|
||||||
|
|
||||||
|
keep_only_tags = [dict(attrs = {'class' : 'box-article'})]
|
||||||
|
remove_tags =[]
|
||||||
|
remove_tags.append(dict(attrs = {'class' : re.compile(r'^newsStream')}))
|
||||||
|
remove_tags.append(dict(attrs = {'target' : '_blank'}))
|
||||||
|
|
||||||
|
feeds = [
|
||||||
|
(u'Piłka Nożna', u'http://www.sportowefakty.pl/pilka-nozna/index.rss'),
|
||||||
|
(u'Koszykówka', u'http://www.sportowefakty.pl/koszykowka/index.rss'),
|
||||||
|
(u'Żużel', u'http://www.sportowefakty.pl/zuzel/index.rss'),
|
||||||
|
(u'Siatkówka', u'http://www.sportowefakty.pl/siatkowka/index.rss'),
|
||||||
|
(u'Zimowe', u'http://www.sportowefakty.pl/zimowe/index.rss'),
|
||||||
|
(u'Hokej', u'http://www.sportowefakty.pl/hokej/index.rss'),
|
||||||
|
(u'Moto', u'http://www.sportowefakty.pl/moto/index.rss'),
|
||||||
|
(u'Tenis', u'http://www.sportowefakty.pl/tenis/index.rss')
|
||||||
|
]
|
||||||
|
|
||||||
|
def get_article_url(self, article):
|
||||||
|
link = article.get('link', None)
|
||||||
|
if 'utm_source' in link:
|
||||||
|
return link.split('?utm')[0]
|
||||||
|
else:
|
||||||
|
return link
|
||||||
|
|
||||||
|
def print_version(self, url):
|
||||||
|
print_url = url + '/drukuj'
|
||||||
|
return print_url
|
||||||
|
|
||||||
|
def preprocess_html(self, soup):
|
||||||
|
head = soup.find('h1')
|
||||||
|
if 'Fotorelacja' in self.tag_to_string(head):
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
for alink in soup.findAll('a'):
|
||||||
|
if alink.string is not None:
|
||||||
|
tstr = alink.string
|
||||||
|
alink.replaceWith(tstr)
|
||||||
|
return soup
|
||||||
|
|
||||||
|
def postprocess_html(self, soup, first):
|
||||||
|
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
|
||||||
|
iurl = tag['src']
|
||||||
|
img = Image()
|
||||||
|
img.open(iurl)
|
||||||
|
if img < 0:
|
||||||
|
raise RuntimeError('Out of memory')
|
||||||
|
img.type = "GrayscaleType"
|
||||||
|
img.save(iurl)
|
||||||
|
return soup
|
@ -36,47 +36,21 @@ class TheOnion(BasicNewsRecipe):
|
|||||||
, 'publisher': publisher
|
, 'publisher': publisher
|
||||||
, 'language' : language
|
, 'language' : language
|
||||||
}
|
}
|
||||||
|
keep_only_tags = [dict(name='article', attrs={'class':'full-article'})]
|
||||||
keep_only_tags = [
|
|
||||||
dict(name='h2', attrs={'class':['section_title','title']})
|
|
||||||
,dict(attrs={'class':['main_image','meta','article_photo_lead','article_body']})
|
|
||||||
,dict(attrs={'id':['entries']})
|
|
||||||
]
|
|
||||||
remove_attributes=['lang','rel']
|
|
||||||
remove_tags_after = dict(attrs={'class':['article_body','feature_content']})
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name=['object','link','iframe','base','meta'])
|
dict(name=['nav', 'aside', 'section', 'meta']),
|
||||||
,dict(name='div', attrs={'class':['toolbar_side','graphical_feature','toolbar_bottom']})
|
{'attrs':{'class':lambda x: x and ('share-tools' in x or 'ad-zone' in x)}},
|
||||||
,dict(name='div', attrs={'id':['recent_slider','sidebar','pagination','related_media']})
|
]
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
(u'Daily' , u'http://feeds.theonion.com/theonion/daily' )
|
(u'Daily' , u'http://feeds.theonion.com/theonion/daily' )
|
||||||
,(u'Sports' , u'http://feeds.theonion.com/theonion/sports' )
|
,(u'Sports' , u'http://feeds.theonion.com/theonion/sports' )
|
||||||
]
|
]
|
||||||
|
|
||||||
def get_article_url(self, article):
|
def preprocess_html(self, soup, *args):
|
||||||
artl = BasicNewsRecipe.get_article_url(self, article)
|
for img in soup.findAll('img', attrs={'data-src':True}):
|
||||||
if artl.startswith('http://www.theonion.com/audio/'):
|
if img['data-src']:
|
||||||
artl = None
|
img['src'] = img['data-src']
|
||||||
return artl
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
|
||||||
for item in soup.findAll(style=True):
|
|
||||||
del item['style']
|
|
||||||
for item in soup.findAll('a'):
|
|
||||||
limg = item.find('img')
|
|
||||||
if item.string is not None:
|
|
||||||
str = item.string
|
|
||||||
item.replaceWith(str)
|
|
||||||
else:
|
|
||||||
if limg:
|
|
||||||
item.name = 'div'
|
|
||||||
item.attrs = []
|
|
||||||
if not limg.has_key('alt'):
|
|
||||||
limg['alt'] = 'image'
|
|
||||||
else:
|
|
||||||
str = self.tag_to_string(item)
|
|
||||||
item.replaceWith(str)
|
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
|
||||||
|
17
recipes/universe_today.recipe
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class UniverseToday(BasicNewsRecipe):
|
||||||
|
title = u'Universe Today'
|
||||||
|
language = 'en'
|
||||||
|
description = u'Space and astronomy news.'
|
||||||
|
__author__ = 'seird'
|
||||||
|
publisher = u'universetoday.com'
|
||||||
|
category = 'science, astronomy, news, rss'
|
||||||
|
oldest_article = 7
|
||||||
|
max_articles_per_feed = 40
|
||||||
|
auto_cleanup = True
|
||||||
|
no_stylesheets = True
|
||||||
|
use_embedded_content = False
|
||||||
|
remove_empty_feeds = True
|
||||||
|
|
||||||
|
feeds = [(u'Universe Today', u'http://feeds.feedburner.com/universetoday/pYdq')]
|
@ -6,17 +6,62 @@ __license__ = 'GPL v3'
|
|||||||
www.canada.com
|
www.canada.com
|
||||||
'''
|
'''
|
||||||
import re
|
import re
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup
|
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup
|
||||||
|
|
||||||
|
|
||||||
class TimesColonist(BasicNewsRecipe):
|
class TimesColonist(BasicNewsRecipe):
|
||||||
|
|
||||||
|
# Customization -- remove sections you don't want.
|
||||||
|
# If your e-reader is an e-ink Kindle and your output profile is
|
||||||
|
# set properly this recipe will not include images because the
|
||||||
|
# resulting file is too large. If you have one of these and want
|
||||||
|
# images you can set kindle_omit_images = False
|
||||||
|
# and remove sections (typically the e-ink Kindles will
|
||||||
|
# work with about a dozen of these, but your mileage may vary).
|
||||||
|
|
||||||
|
kindle_omit_images = True
|
||||||
|
|
||||||
|
section_list = [
|
||||||
|
('','Web Front Page'),
|
||||||
|
('news/','News Headlines'),
|
||||||
|
('news/b-c/','BC News'),
|
||||||
|
('news/national/','National News'),
|
||||||
|
('news/world/','World News'),
|
||||||
|
('opinion/','Opinion'),
|
||||||
|
('opinion/letters/','Letters'),
|
||||||
|
('business/','Business'),
|
||||||
|
('business/money/','Money'),
|
||||||
|
('business/technology/','Technology'),
|
||||||
|
('business/working/','Working'),
|
||||||
|
('sports/','Sports'),
|
||||||
|
('sports/hockey/','Hockey'),
|
||||||
|
('sports/football/','Football'),
|
||||||
|
('sports/basketball/','Basketball'),
|
||||||
|
('sports/golf/','Golf'),
|
||||||
|
('entertainment/','entertainment'),
|
||||||
|
('entertainment/go/','Go!'),
|
||||||
|
('entertainment/music/','Music'),
|
||||||
|
('entertainment/books/','Books'),
|
||||||
|
('entertainment/Movies/','Movies'),
|
||||||
|
('entertainment/television/','Television'),
|
||||||
|
('life/','Life'),
|
||||||
|
('life/health/','Health'),
|
||||||
|
('life/travel/','Travel'),
|
||||||
|
('life/driving/','Driving'),
|
||||||
|
('life/homes/','Homes'),
|
||||||
|
('life/food-drink/','Food & Drink')
|
||||||
|
]
|
||||||
|
|
||||||
title = u'Victoria Times Colonist'
|
title = u'Victoria Times Colonist'
|
||||||
url_prefix = 'http://www.timescolonist.com'
|
url_prefix = 'http://www.timescolonist.com'
|
||||||
description = u'News from Victoria, BC'
|
description = u'News from Victoria, BC'
|
||||||
fp_tag = 'CAN_TC'
|
fp_tag = 'CAN_TC'
|
||||||
|
|
||||||
|
masthead_url = 'http://www.timescolonist.com/gmg/img/global/logoTimesColonist.png'
|
||||||
|
|
||||||
|
|
||||||
url_list = []
|
url_list = []
|
||||||
language = 'en_CA'
|
language = 'en_CA'
|
||||||
__author__ = 'Nick Redding'
|
__author__ = 'Nick Redding'
|
||||||
@ -29,15 +74,21 @@ class TimesColonist(BasicNewsRecipe):
|
|||||||
.caption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
.caption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
||||||
'''
|
'''
|
||||||
keep_only_tags = [dict(name='div', attrs={'class':re.compile('main.content')})]
|
keep_only_tags = [dict(name='div', attrs={'class':re.compile('main.content')})]
|
||||||
remove_tags = [{'class':'comments'},
|
|
||||||
{'id':'photocredit'},
|
|
||||||
dict(name='div', attrs={'class':re.compile('top.controls')}),
|
|
||||||
dict(name='div', attrs={'class':re.compile('social')}),
|
|
||||||
dict(name='div', attrs={'class':re.compile('tools')}),
|
|
||||||
dict(name='div', attrs={'class':re.compile('bottom.tools')}),
|
|
||||||
dict(name='div', attrs={'class':re.compile('window')}),
|
|
||||||
dict(name='div', attrs={'class':re.compile('related.news.element')})]
|
|
||||||
|
|
||||||
|
def __init__(self, options, log, progress_reporter):
|
||||||
|
self.remove_tags = [{'class':'comments'},
|
||||||
|
{'id':'photocredit'},
|
||||||
|
dict(name='div', attrs={'class':re.compile('top.controls')}),
|
||||||
|
dict(name='div', attrs={'class':re.compile('^comments')}),
|
||||||
|
dict(name='div', attrs={'class':re.compile('social')}),
|
||||||
|
dict(name='div', attrs={'class':re.compile('tools')}),
|
||||||
|
dict(name='div', attrs={'class':re.compile('bottom.tools')}),
|
||||||
|
dict(name='div', attrs={'class':re.compile('window')}),
|
||||||
|
dict(name='div', attrs={'class':re.compile('related.news.element')})]
|
||||||
|
print("PROFILE NAME = "+options.output_profile.short_name)
|
||||||
|
if self.kindle_omit_images and options.output_profile.short_name in ['kindle', 'kindle_dx', 'kindle_pw']:
|
||||||
|
self.remove_tags.append(dict(name='div', attrs={'class':re.compile('image-container')}))
|
||||||
|
BasicNewsRecipe.__init__(self, options, log, progress_reporter)
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
from datetime import timedelta, date
|
from datetime import timedelta, date
|
||||||
@ -122,7 +173,6 @@ class TimesColonist(BasicNewsRecipe):
|
|||||||
def preprocess_html(self,soup):
|
def preprocess_html(self,soup):
|
||||||
byline = soup.find('p',attrs={'class':re.compile('ancillary')})
|
byline = soup.find('p',attrs={'class':re.compile('ancillary')})
|
||||||
if byline is not None:
|
if byline is not None:
|
||||||
byline.find('a')
|
|
||||||
authstr = self.tag_to_string(byline,False)
|
authstr = self.tag_to_string(byline,False)
|
||||||
authstr = re.sub('/ *Times Colonist','/',authstr, flags=re.IGNORECASE)
|
authstr = re.sub('/ *Times Colonist','/',authstr, flags=re.IGNORECASE)
|
||||||
authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE)
|
authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE)
|
||||||
@ -149,9 +199,10 @@ class TimesColonist(BasicNewsRecipe):
|
|||||||
atag = htag.a
|
atag = htag.a
|
||||||
if atag is not None:
|
if atag is not None:
|
||||||
url = atag['href']
|
url = atag['href']
|
||||||
#print("Checking "+url)
|
url = url.strip()
|
||||||
if atag['href'].startswith('/'):
|
# print("Checking >>"+url+'<<\n\r')
|
||||||
url = self.url_prefix+atag['href']
|
if url.startswith('/'):
|
||||||
|
url = self.url_prefix+url
|
||||||
if url in self.url_list:
|
if url in self.url_list:
|
||||||
return
|
return
|
||||||
self.url_list.append(url)
|
self.url_list.append(url)
|
||||||
@ -171,10 +222,10 @@ class TimesColonist(BasicNewsRecipe):
|
|||||||
if dtag is not None:
|
if dtag is not None:
|
||||||
description = self.tag_to_string(dtag,False)
|
description = self.tag_to_string(dtag,False)
|
||||||
article_list.append(dict(title=title,url=url,date='',description=description,author='',content=''))
|
article_list.append(dict(title=title,url=url,date='',description=description,author='',content=''))
|
||||||
#print(sectitle+title+": description = "+description+" URL="+url)
|
print(sectitle+title+": description = "+description+" URL="+url+'\n\r')
|
||||||
|
|
||||||
def add_section_index(self,ans,securl,sectitle):
|
def add_section_index(self,ans,securl,sectitle):
|
||||||
print("Add section url="+self.url_prefix+'/'+securl)
|
print("Add section url="+self.url_prefix+'/'+securl+'\n\r')
|
||||||
try:
|
try:
|
||||||
soup = self.index_to_soup(self.url_prefix+'/'+securl)
|
soup = self.index_to_soup(self.url_prefix+'/'+securl)
|
||||||
except:
|
except:
|
||||||
@ -193,33 +244,7 @@ class TimesColonist(BasicNewsRecipe):
|
|||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
ans = []
|
ans = []
|
||||||
ans = self.add_section_index(ans,'','Web Front Page')
|
for (url,title) in self.section_list:
|
||||||
ans = self.add_section_index(ans,'news/','News Headlines')
|
ans = self.add_section_index(ans,url,title)
|
||||||
ans = self.add_section_index(ans,'news/b-c/','BC News')
|
|
||||||
ans = self.add_section_index(ans,'news/national/','Natioanl News')
|
|
||||||
ans = self.add_section_index(ans,'news/world/','World News')
|
|
||||||
ans = self.add_section_index(ans,'opinion/','Opinion')
|
|
||||||
ans = self.add_section_index(ans,'opinion/letters/','Letters')
|
|
||||||
ans = self.add_section_index(ans,'business/','Business')
|
|
||||||
ans = self.add_section_index(ans,'business/money/','Money')
|
|
||||||
ans = self.add_section_index(ans,'business/technology/','Technology')
|
|
||||||
ans = self.add_section_index(ans,'business/working/','Working')
|
|
||||||
ans = self.add_section_index(ans,'sports/','Sports')
|
|
||||||
ans = self.add_section_index(ans,'sports/hockey/','Hockey')
|
|
||||||
ans = self.add_section_index(ans,'sports/football/','Football')
|
|
||||||
ans = self.add_section_index(ans,'sports/basketball/','Basketball')
|
|
||||||
ans = self.add_section_index(ans,'sports/golf/','Golf')
|
|
||||||
ans = self.add_section_index(ans,'entertainment/','entertainment')
|
|
||||||
ans = self.add_section_index(ans,'entertainment/go/','Go!')
|
|
||||||
ans = self.add_section_index(ans,'entertainment/music/','Music')
|
|
||||||
ans = self.add_section_index(ans,'entertainment/books/','Books')
|
|
||||||
ans = self.add_section_index(ans,'entertainment/Movies/','movies')
|
|
||||||
ans = self.add_section_index(ans,'entertainment/television/','Television')
|
|
||||||
ans = self.add_section_index(ans,'life/','Life')
|
|
||||||
ans = self.add_section_index(ans,'life/health/','Health')
|
|
||||||
ans = self.add_section_index(ans,'life/travel/','Travel')
|
|
||||||
ans = self.add_section_index(ans,'life/driving/','Driving')
|
|
||||||
ans = self.add_section_index(ans,'life/homes/','Homes')
|
|
||||||
ans = self.add_section_index(ans,'life/food-drink/','Food & Drink')
|
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
@ -1,144 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
|
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
|
||||||
|
|
||||||
class GazetaWyborczaDuzyForma(BasicNewsRecipe):
|
|
||||||
cover_url = 'http://bi.gazeta.pl/im/8/5415/m5415058.gif'
|
|
||||||
title = u"Gazeta Wyborcza Duzy Format"
|
|
||||||
__author__ = 'ravcio - rlelusz[at]gmail.com'
|
|
||||||
description = u"Articles from Gazeta's website"
|
|
||||||
language = 'pl'
|
|
||||||
max_articles_per_feed = 50 #you can increade it event up to maybe 600, should still work
|
|
||||||
recursions = 0
|
|
||||||
encoding = 'iso-8859-2'
|
|
||||||
no_stylesheets = True
|
|
||||||
remove_javascript = True
|
|
||||||
use_embedded_content = False
|
|
||||||
|
|
||||||
|
|
||||||
keep_only_tags = [
|
|
||||||
dict(name='div', attrs={'id':['k1']})
|
|
||||||
]
|
|
||||||
|
|
||||||
remove_tags = [
|
|
||||||
dict(name='div', attrs={'class':['zdjM', 'rel_video', 'zdjP', 'rel_box', 'index mod_zi_dolStrony']})
|
|
||||||
,dict(name='div', attrs={'id':['source', 'banP4', 'article_toolbar', 'rel', 'inContext_disabled']})
|
|
||||||
,dict(name='ul', attrs={'id':['articleToolbar']})
|
|
||||||
,dict(name='img', attrs={'class':['brand']})
|
|
||||||
,dict(name='h5', attrs={'class':['author']})
|
|
||||||
,dict(name='h6', attrs={'class':['date']})
|
|
||||||
,dict(name='p', attrs={'class':['txt_upl']})
|
|
||||||
]
|
|
||||||
|
|
||||||
remove_tags_after = [
|
|
||||||
dict(name='div', attrs={'id':['Str']}) #nawigator numerow linii
|
|
||||||
]
|
|
||||||
|
|
||||||
def load_article_links(self, url, count):
|
|
||||||
print '--- load_article_links', url, count
|
|
||||||
|
|
||||||
#page with link to articles
|
|
||||||
soup = self.index_to_soup(url)
|
|
||||||
|
|
||||||
#table with articles
|
|
||||||
list = soup.find('div', attrs={'class':'GWdalt'})
|
|
||||||
|
|
||||||
#single articles (link, title, ...)
|
|
||||||
links = list.findAll('div', attrs={'class':['GWdaltE']})
|
|
||||||
|
|
||||||
if len(links) < count:
|
|
||||||
#load links to more articles...
|
|
||||||
|
|
||||||
#remove new link
|
|
||||||
pages_nav = list.find('div', attrs={'class':'pages'})
|
|
||||||
next = pages_nav.find('a', attrs={'class':'next'})
|
|
||||||
if next:
|
|
||||||
print 'next=', next['href']
|
|
||||||
url = 'http://wyborcza.pl' + next['href']
|
|
||||||
#e.g. url = 'http://wyborcza.pl/0,75480.html?str=2'
|
|
||||||
|
|
||||||
older_links = self.load_article_links(url, count - len(links))
|
|
||||||
links.extend(older_links)
|
|
||||||
|
|
||||||
return links
|
|
||||||
|
|
||||||
|
|
||||||
#produce list of articles to download
|
|
||||||
def parse_index(self):
|
|
||||||
print '--- parse_index'
|
|
||||||
|
|
||||||
max_articles = 8000
|
|
||||||
links = self.load_article_links('http://wyborcza.pl/0,75480.html', max_articles)
|
|
||||||
|
|
||||||
ans = []
|
|
||||||
key = None
|
|
||||||
articles = {}
|
|
||||||
|
|
||||||
key = 'Uncategorized'
|
|
||||||
articles[key] = []
|
|
||||||
|
|
||||||
for div_art in links:
|
|
||||||
div_date = div_art.find('div', attrs={'class':'kL'})
|
|
||||||
div = div_art.find('div', attrs={'class':'kR'})
|
|
||||||
|
|
||||||
a = div.find('a', href=True)
|
|
||||||
|
|
||||||
url = a['href']
|
|
||||||
title = a.string
|
|
||||||
description = ''
|
|
||||||
pubdate = div_date.string.rstrip().lstrip()
|
|
||||||
summary = div.find('span', attrs={'class':'lead'})
|
|
||||||
|
|
||||||
desc = summary.find('a', href=True)
|
|
||||||
if desc:
|
|
||||||
desc.extract()
|
|
||||||
|
|
||||||
description = self.tag_to_string(summary, use_alt=False)
|
|
||||||
description = description.rstrip().lstrip()
|
|
||||||
|
|
||||||
feed = key if key is not None else 'Duzy Format'
|
|
||||||
|
|
||||||
if not articles.has_key(feed):
|
|
||||||
articles[feed] = []
|
|
||||||
|
|
||||||
if description != '': # skip just pictures atricle
|
|
||||||
articles[feed].append(
|
|
||||||
dict(title=title, url=url, date=pubdate,
|
|
||||||
description=description,
|
|
||||||
content=''))
|
|
||||||
|
|
||||||
ans = [(key, articles[key])]
|
|
||||||
return ans
|
|
||||||
|
|
||||||
def append_page(self, soup, appendtag, position):
|
|
||||||
pager = soup.find('div',attrs={'id':'Str'})
|
|
||||||
if pager:
|
|
||||||
#seek for 'a' element with nast value (if not found exit)
|
|
||||||
list = pager.findAll('a')
|
|
||||||
|
|
||||||
for elem in list:
|
|
||||||
if 'nast' in elem.string:
|
|
||||||
nexturl = elem['href']
|
|
||||||
|
|
||||||
soup2 = self.index_to_soup('http://warszawa.gazeta.pl' + nexturl)
|
|
||||||
|
|
||||||
texttag = soup2.find('div', attrs={'id':'artykul'})
|
|
||||||
|
|
||||||
newpos = len(texttag.contents)
|
|
||||||
self.append_page(soup2,texttag,newpos)
|
|
||||||
texttag.extract()
|
|
||||||
appendtag.insert(position,texttag)
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
|
||||||
self.append_page(soup, soup.body, 3)
|
|
||||||
|
|
||||||
# finally remove some tags
|
|
||||||
pager = soup.find('div',attrs={'id':'Str'})
|
|
||||||
if pager:
|
|
||||||
pager.extract()
|
|
||||||
|
|
||||||
pager = soup.find('div',attrs={'class':'tylko_int'})
|
|
||||||
if pager:
|
|
||||||
pager.extract()
|
|
||||||
|
|
||||||
return soup
|
|
57
recipes/wysokie_obcasy.recipe
Normal file
@ -0,0 +1,57 @@
#!/usr/bin/env python
__license__ = 'GPL v3'

from calibre.web.feeds.news import BasicNewsRecipe


class WysokieObcasyRecipe(BasicNewsRecipe):
    __author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
    language = 'pl'
    version = 1

    title = u'Wysokie Obcasy'
    publisher = 'Agora SA'
    description = u'Serwis sobotniego dodatku do Gazety Wyborczej'
    category = 'magazine'
    language = 'pl'
    publication_type = 'magazine'
    cover_url = ''
    remove_empty_feeds = True
    no_stylesheets = True
    oldest_article = 7
    max_articles_per_feed = 100000
    recursions = 0

    no_stylesheets = True
    remove_javascript = True
    simultaneous_downloads = 5

    keep_only_tags = []
    keep_only_tags.append(dict(name='div', attrs={'id': 'article'}))

    remove_tags = []
    remove_tags.append(dict(name='img'))
    remove_tags.append(dict(name='p', attrs={'class': 'info'}))

    extra_css = '''
        body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
        h1 {text-align: left;}
    '''

    feeds = [
        ('Wszystkie Artykuly', 'feed://www.wysokieobcasy.pl/pub/rss/wysokieobcasy.xml'),
    ]

    def print_version(self, url):
        baseURL = 'http://www.wysokieobcasy.pl/wysokie-obcasy'
        segments = url.split(',')
        subPath = '/2029020,'
        articleURL1 = segments[1]
        articleURL2 = segments[2]
        printVerString = articleURL1 + ',' + articleURL2
        s = baseURL + subPath + printVerString + '.html'
        return s

    def get_cover_url(self):
        soup = self.index_to_soup('http://www.wysokieobcasy.pl/wysokie-obcasy/0,0.html')
        self.cover_url = soup.find(attrs={'class': 'holder_cr'}).find('img')['src']
        return getattr(self, 'cover_url', self.cover_url)
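A quick sanity check of the print_version() rewrite above; the article URL below is made up and only the comma-separated segments matter:

    url = 'http://www.wysokieobcasy.pl/wysokie-obcasy/1,96856,13693318,Tytul_artykulu.html'
    segments = url.split(',')
    # print_version() keeps segments[1] and segments[2] and rebuilds the print URL
    print('http://www.wysokieobcasy.pl/wysokie-obcasy/2029020,'
          + segments[1] + ',' + segments[2] + '.html')
    # http://www.wysokieobcasy.pl/wysokie-obcasy/2029020,96856,13693318.html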
@ -357,7 +357,7 @@
 <xsl:apply-templates/>
 </xsl:template>

 <xsl:template match="rtf:table">
 <xsl:element name="table">
 <xsl:attribute name="id">
 <xsl:value-of select="generate-id(.)"/>
@ -390,7 +390,6 @@


 <xsl:output method = "xml"/>

 <xsl:key name="style-types" match="rtf:paragraph-definition" use="@style-number"/>


@ -415,13 +414,11 @@
 </xsl:template>

 <xsl:template match="rtf:page-break">
-<xsl:element name="br">
-<xsl:attribute name="style">page-break-after:always</xsl:attribute>
-</xsl:element>
+<br style = "page-break-after:always"/>
 </xsl:template>

 <xsl:template match="rtf:hardline-break">
-<xsl:element name="br"/>
+<br/>
 </xsl:template>

 <xsl:template match="rtf:rtf-definition|rtf:font-table|rtf:color-table|rtf:style-table|rtf:page-definition|rtf:list-table|rtf:override-table|rtf:override-list|rtf:list-text"/>
@ -445,7 +442,7 @@
 </xsl:template>

 <xsl:template match = "rtf:field-block">
 <xsl:apply-templates/>
 </xsl:template>

 <xsl:template match = "rtf:field[@type='hyperlink']">
@ -472,9 +469,7 @@
 </xsl:template>

 <xsl:template match="rtf:pict">
-<xsl:element name="img">
-<xsl:attribute name="src"><xsl:value-of select="@num" /></xsl:attribute>
-</xsl:element>
+<img src = "{@num}"/>
 </xsl:template>

 <xsl:template match="*">
@ -4,7 +4,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 __appname__ = u'calibre'
-numeric_version = (0, 9, 25)
+numeric_version = (0, 9, 26)
 __version__ = u'.'.join(map(unicode, numeric_version))
 __author__ = u"Kovid Goyal <kovid@kovidgoyal.net>"

@ -757,9 +757,10 @@ from calibre.ebooks.metadata.sources.isbndb import ISBNDB
 from calibre.ebooks.metadata.sources.overdrive import OverDrive
 from calibre.ebooks.metadata.sources.douban import Douban
 from calibre.ebooks.metadata.sources.ozon import Ozon
-# from calibre.ebooks.metadata.sources.google_images import GoogleImages
+from calibre.ebooks.metadata.sources.google_images import GoogleImages
+from calibre.ebooks.metadata.sources.big_book_search import BigBookSearch

-plugins += [GoogleBooks, Amazon, Edelweiss, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon]
+plugins += [GoogleBooks, GoogleImages, Amazon, Edelweiss, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon, BigBookSearch]

 # }}}

@ -91,7 +91,7 @@ def restore_plugin_state_to_default(plugin_or_name):
     config['enabled_plugins'] = ep

 default_disabled_plugins = set([
-        'Overdrive', 'Douban Books', 'OZON.ru', 'Edelweiss', 'Google Images',
+        'Overdrive', 'Douban Books', 'OZON.ru', 'Edelweiss', 'Google Images', 'Big Book Search',
 ])

 def is_disabled(plugin):
@ -132,7 +132,7 @@ class Worker(Thread): # Get details {{{
                 text()="Détails sur le produit" or \
                 text()="Detalles del producto" or \
                 text()="Detalhes do produto" or \
-                text()="登録情報"]/../div[@class="content"]
+                starts-with(text(), "登録情報")]/../div[@class="content"]
                 '''
         # Editor: is for Spanish
         self.publisher_xpath = '''
@ -235,6 +235,12 @@ class Worker(Thread): # Get details {{{
             msg = 'Failed to parse amazon details page: %r'%self.url
             self.log.exception(msg)
             return
+        if self.domain == 'jp':
+            for a in root.xpath('//a[@href]'):
+                if 'black-curtain-redirect.html' in a.get('href'):
+                    self.url = 'http://amazon.co.jp'+a.get('href')
+                    self.log('Black curtain redirect found, following')
+                    return self.get_details()

         errmsg = root.xpath('//*[@id="errorMessage"]')
         if errmsg:
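The block added above follows Amazon JP's adult-content interstitial (the "black curtain") by re-requesting the real product page. A standalone sketch of just the link detection, using lxml; the HTML fragment is made up:

    from lxml import html

    # Hypothetical fragment of an interstitial page returned by amazon.co.jp
    root = html.fromstring('<a href="/gp/black-curtain-redirect.html?location=/dp/4799500066">Continue</a>')
    for a in root.xpath('//a[@href]'):
        if 'black-curtain-redirect.html' in a.get('href'):
            print('http://amazon.co.jp' + a.get('href'))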
@ -252,8 +258,8 @@ class Worker(Thread): # Get details {{{
             self.log.exception('Error parsing asin for url: %r'%self.url)
             asin = None
         if self.testing:
-            import tempfile
-            with tempfile.NamedTemporaryFile(prefix=asin + '_',
+            import tempfile, uuid
+            with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
                     suffix='.html', delete=False) as f:
                 f.write(raw)
                 print ('Downloaded html for', asin, 'saved in', f.name)
@ -499,7 +505,7 @@ class Worker(Thread): # Get details {{{
     def parse_language(self, pd):
         for x in reversed(pd.xpath(self.language_xpath)):
             if x.tail:
-                raw = x.tail.strip()
+                raw = x.tail.strip().partition(',')[0].strip()
                 ans = self.lang_map.get(raw, None)
                 if ans:
                     return ans
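The partition() change above keeps only the text before the first comma, so a field that lists several languages can still be looked up in lang_map. A quick illustration with a made-up value:

    raw = 'English, French'
    raw.strip()                               # 'English, French'  (old behaviour)
    raw.strip().partition(',')[0].strip()     # 'English'          (new behaviour)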
@ -1004,6 +1010,11 @@ if __name__ == '__main__': # tests {{{
     ] # }}}

     jp_tests = [ # {{{
+            ( # Adult filtering test
+            {'identifiers':{'isbn':'4799500066'}},
+            [title_test(u'Bitch Trap'),]
+            ),
+
             ( # isbn -> title, authors
             {'identifiers':{'isbn': '9784101302720' }},
             [title_test(u'精霊の守り人',
@ -31,7 +31,7 @@ msprefs.defaults['find_first_edition_date'] = False
 # Google covers are often poor quality (scans/errors) but they have high
 # resolution, so they trump covers from better sources. So make sure they
 # are only used if no other covers are found.
-msprefs.defaults['cover_priorities'] = {'Google':2, 'Google Images':2}
+msprefs.defaults['cover_priorities'] = {'Google':2, 'Google Images':2, 'Big Book Search':2}

 def create_log(ostream=None):
     from calibre.utils.logging import ThreadSafeLog, FileStream
@ -429,6 +429,40 @@ class Source(Plugin):
         mi.tags = list(map(fixcase, mi.tags))
         mi.isbn = check_isbn(mi.isbn)

+    def download_multiple_covers(self, title, authors, urls, get_best_cover, timeout, result_queue, abort, log, prefs_name='max_covers'):
+        if not urls:
+            log('No images found for, title: %r and authors: %r'%(title, authors))
+            return
+        from threading import Thread
+        import time
+        if prefs_name:
+            urls = urls[:self.prefs[prefs_name]]
+        if get_best_cover:
+            urls = urls[:1]
+        log('Downloading %d covers'%len(urls))
+        workers = [Thread(target=self.download_image, args=(u, timeout, log, result_queue)) for u in urls]
+        for w in workers:
+            w.daemon = True
+            w.start()
+        alive = True
+        start_time = time.time()
+        while alive and not abort.is_set() and time.time() - start_time < timeout:
+            alive = False
+            for w in workers:
+                if w.is_alive():
+                    alive = True
+                    break
+            abort.wait(0.1)
+
+    def download_image(self, url, timeout, log, result_queue):
+        try:
+            ans = self.browser.open_novisit(url, timeout=timeout).read()
+            result_queue.put((self, ans))
+            log('Downloaded cover from: %s'%url)
+        except Exception:
+            self.log.exception('Failed to download cover from: %r'%url)
+
+
 # }}}

 # Metadata API {{{
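The new download_multiple_covers() helper reports each fetched cover by putting a (plugin, raw_bytes) tuple on result_queue, as download_image() above shows. A minimal sketch of draining such a queue after a run; the queue name and surrounding usage are illustrative only:

    from Queue import Queue, Empty   # Python 2 stdlib, as used by this code

    rq = Queue()
    # ... a cover-capable Source subclass calls
    # self.download_multiple_covers(title, authors, urls, get_best_cover,
    #                               timeout, rq, abort, log) ...
    covers = []
    while True:
        try:
            plugin, raw = rq.get_nowait()   # tuples put by download_image()
        except Empty:
            break
        covers.append(raw)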
src/calibre/ebooks/metadata/sources/big_book_search.py (new file, 58 lines)
@ -0,0 +1,58 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from calibre.ebooks.metadata.sources.base import Source, Option

def get_urls(br, tokens):
    from urllib import quote_plus
    from mechanize import Request
    from lxml import html
    escaped = [quote_plus(x.encode('utf-8')) for x in tokens if x and x.strip()]
    q = b'+'.join(escaped)
    url = 'http://bigbooksearch.com/books/'+q
    br.open(url).read()
    req = Request('http://bigbooksearch.com/query.php?SearchIndex=books&Keywords=%s&ItemPage=1'%q)
    req.add_header('X-Requested-With', 'XMLHttpRequest')
    req.add_header('Referer', url)
    raw = br.open(req).read()
    root = html.fromstring(raw.decode('utf-8'))
    urls = [i.get('src') for i in root.xpath('//img[@src]')]
    return urls

class BigBookSearch(Source):

    name = 'Big Book Search'
    description = _('Downloads multiple book covers from Amazon. Useful to find alternate covers.')
    capabilities = frozenset(['cover'])
    config_help_message = _('Configure the Big Book Search plugin')
    can_get_multiple_covers = True
    options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),
                      _('The maximum number of covers to process from the search result')),
              )
    supports_gzip_transfer_encoding = True

    def download_cover(self, log, result_queue, abort,
                       title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
        if not title:
            return
        br = self.browser
        tokens = tuple(self.get_title_tokens(title)) + tuple(self.get_author_tokens(authors))
        urls = get_urls(br, tokens)
        self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)

def test():
    from calibre import browser
    import pprint
    br = browser()
    urls = get_urls(br, ['consider', 'phlebas', 'banks'])
    pprint.pprint(urls)

if __name__ == '__main__':
    test()
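For orientation, these are the search URL and the XHR query URL that get_urls() above builds for the tokens used in test(); same quoting logic, no network access:

    from urllib import quote_plus   # Python 2, as in the module above

    tokens = ['consider', 'phlebas', 'banks']
    escaped = [quote_plus(x.encode('utf-8')) for x in tokens if x and x.strip()]
    q = b'+'.join(escaped)
    print('http://bigbooksearch.com/books/' + q)
    # http://bigbooksearch.com/books/consider+phlebas+banks
    print('http://bigbooksearch.com/query.php?SearchIndex=books&Keywords=%s&ItemPage=1' % q)
    # http://bigbooksearch.com/query.php?SearchIndex=books&Keywords=consider+phlebas+banks&ItemPage=1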
@ -18,12 +18,13 @@ from calibre.utils.magick.draw import Image, save_cover_data_to

 class Worker(Thread):

-    def __init__(self, plugin, abort, title, authors, identifiers, timeout, rq):
+    def __init__(self, plugin, abort, title, authors, identifiers, timeout, rq, get_best_cover=False):
         Thread.__init__(self)
         self.daemon = True

         self.plugin = plugin
         self.abort = abort
+        self.get_best_cover = get_best_cover
         self.buf = BytesIO()
         self.log = create_log(self.buf)
         self.title, self.authors, self.identifiers = (title, authors,
@ -37,7 +38,7 @@ class Worker(Thread):
         try:
             if self.plugin.can_get_multiple_covers:
                 self.plugin.download_cover(self.log, self.rq, self.abort,
-                    title=self.title, authors=self.authors, get_best_cover=True,
+                    title=self.title, authors=self.authors, get_best_cover=self.get_best_cover,
                     identifiers=self.identifiers, timeout=self.timeout)
             else:
                 self.plugin.download_cover(self.log, self.rq, self.abort,
@ -72,7 +73,7 @@ def process_result(log, result):
     return (plugin, width, height, fmt, data)

 def run_download(log, results, abort,
-        title=None, authors=None, identifiers={}, timeout=30):
+        title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
     '''
     Run the cover download, putting results into the queue :param:`results`.

@ -89,7 +90,7 @@ def run_download(log, results, abort,
     plugins = [p for p in metadata_plugins(['cover']) if p.is_configured()]

     rq = Queue()
-    workers = [Worker(p, abort, title, authors, identifiers, timeout, rq) for p
+    workers = [Worker(p, abort, title, authors, identifiers, timeout, rq, get_best_cover=get_best_cover) for p
             in plugins]
     for w in workers:
         w.start()
@ -163,7 +164,7 @@ def download_cover(log,
     abort = Event()

     run_download(log, rq, abort, title=title, authors=authors,
-            identifiers=identifiers, timeout=timeout)
+            identifiers=identifiers, timeout=timeout, get_best_cover=True)

     results = []

@ -106,6 +106,8 @@ class Worker(Thread): # {{{
             parts = pub.partition(':')[0::2]
             pub = parts[1] or parts[0]
             try:
+                if ', Ship Date:' in pub:
+                    pub = pub.partition(', Ship Date:')[0]
                 q = parse_only_date(pub, assume_utc=True)
                 if q.year != UNDEFINED_DATE:
                     mi.pubdate = q
@ -39,39 +39,11 @@ class GoogleImages(Source):
             title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
         if not title:
             return
-        from threading import Thread
-        import time
         timeout = max(60, timeout) # Needs at least a minute
         title = ' '.join(self.get_title_tokens(title))
         author = ' '.join(self.get_author_tokens(authors))
         urls = self.get_image_urls(title, author, log, abort, timeout)
-        if not urls:
-            log('No images found in Google for, title: %r and authors: %r'%(title, author))
-            return
-        urls = urls[:self.prefs['max_covers']]
-        if get_best_cover:
-            urls = urls[:1]
-        workers = [Thread(target=self.download_image, args=(url, timeout, log, result_queue)) for url in urls]
-        for w in workers:
-            w.daemon = True
-            w.start()
-        alive = True
-        start_time = time.time()
-        while alive and not abort.is_set() and time.time() - start_time < timeout:
-            alive = False
-            for w in workers:
-                if w.is_alive():
-                    alive = True
-                    break
-            abort.wait(0.1)
-
-    def download_image(self, url, timeout, log, result_queue):
-        try:
-            ans = self.browser.open_novisit(url, timeout=timeout).read()
-            result_queue.put((self, ans))
-            log('Downloaded cover from: %s'%url)
-        except Exception:
-            self.log.exception('Failed to download cover from: %r'%url)
+        self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)

     def get_image_urls(self, title, author, log, abort, timeout):
         from calibre.utils.ipc.simple_worker import fork_job, WorkerError
@ -180,5 +180,6 @@ class BorderParse:
             elif 'single' in border_style_list:
                 new_border_dict[att] = 'single'
             else:
-                new_border_dict[att] = border_style_list[0]
+                if border_style_list:
+                    new_border_dict[att] = border_style_list[0]
         return new_border_dict
@ -559,11 +559,11 @@ class TOCView(QWidget): # {{{
         b.setToolTip(_('Remove all selected entries'))
         b.clicked.connect(self.del_items)

-        self.left_button = b = QToolButton(self)
+        self.right_button = b = QToolButton(self)
         b.setIcon(QIcon(I('forward.png')))
         b.setIconSize(QSize(ICON_SIZE, ICON_SIZE))
         l.addWidget(b, 4, 3)
-        b.setToolTip(_('Unindent the current entry [Ctrl+Left]'))
+        b.setToolTip(_('Indent the current entry [Ctrl+Right]'))
         b.clicked.connect(self.tocw.move_right)

         self.down_button = b = QToolButton(self)
@ -54,7 +54,7 @@ def get_parser(usage):
 def get_db(dbpath, options):
     global do_notify
     if options.library_path is not None:
-        dbpath = options.library_path
+        dbpath = os.path.expanduser(options.library_path)
     if dbpath is None:
         raise ValueError('No saved library path, either run the GUI or use the'
                          ' --with-library option')
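The expanduser() change above means a --with-library value written with a leading tilde now resolves to the real home directory; the path below is hypothetical:

    import os
    print(os.path.expanduser('~/Books'))   # e.g. /home/user/Books instead of the literal '~/Books'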
@ -174,7 +174,13 @@ def _extractall(f, path=None, file_info=None):
         has_data_descriptors = header.flags & (1 << 3)
         seekval = header.compressed_size + (16 if has_data_descriptors else 0)
         found = True
-        parts = header.filename.split('/')
+        # Sanitize path, changing absolute to relative paths and removing
+        # .. and .
+        fname = header.filename.replace(os.sep, '/')
+        fname = os.path.splitdrive(fname)[1]
+        parts = [x for x in fname.split('/') if x not in {'', os.path.pardir, os.path.curdir}]
+        if not parts:
+            continue
         if header.uncompressed_size == 0:
             # Directory
             f.seek(f.tell()+seekval)
@ -17,8 +17,7 @@ class MReplace(UserDict):

     def compile_regex(self):
         if len(self.data) > 0:
-            keys = sorted(self.data.keys(), key=len)
-            keys.reverse()
+            keys = sorted(self.data.keys(), key=len, reverse=True)
             tmp = "(%s)" % "|".join(map(re.escape, keys))
             if self.re != tmp:
                 self.re = tmp
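The reverse=True sort above replaces the separate keys.reverse() call; either way the keys must be ordered longest-first, because re alternation takes the first branch that matches at a given position. A standalone illustration with a made-up replacement table:

    import re

    data = {'cat': 'feline', 'catalog': 'list'}
    shortest_first = "(%s)" % "|".join(map(re.escape, sorted(data, key=len)))
    longest_first = "(%s)" % "|".join(map(re.escape, sorted(data, key=len, reverse=True)))
    re.sub(shortest_first, lambda m: data[m.group(1)], 'catalog')  # 'felinealog' (wrong)
    re.sub(longest_first, lambda m: data[m.group(1)], 'catalog')   # 'list'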
@ -1099,10 +1099,13 @@ class ZipFile:

         base_target = targetpath # Added by Kovid

-        # don't include leading "/" from file name if present
-        fname = member.filename
-        if fname.startswith('/'):
-            fname = fname[1:]
+        # Sanitize path, changing absolute paths to relative paths
+        # and removing .. and . (changed by Kovid)
+        fname = member.filename.replace(os.sep, '/')
+        fname = os.path.splitdrive(fname)[1]
+        fname = '/'.join(x for x in fname.split('/') if x not in {'', os.path.curdir, os.path.pardir})
+        if not fname:
+            raise BadZipfile('The member %r has an invalid name'%member.filename)

         targetpath = os.path.normpath(os.path.join(base_target, fname))

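A quick illustration of what the sanitization above does to a hostile member name (the classic "zip slip" case); the entry name and extraction directory are hypothetical:

    import os

    member_filename = '../../etc/passwd'          # hypothetical malicious entry
    fname = member_filename.replace(os.sep, '/')
    fname = os.path.splitdrive(fname)[1]
    fname = '/'.join(x for x in fname.split('/')
                     if x not in {'', os.path.curdir, os.path.pardir})
    # fname is now 'etc/passwd', so the join below can no longer escape base_target
    print(os.path.normpath(os.path.join('/tmp/extract', fname)))  # /tmp/extract/etc/passwd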