mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
Sync to trunk.
This commit is contained in:
commit
cc2155e671
4210
Changelog.old.yaml
4210
Changelog.old.yaml
File diff suppressed because it is too large
Load Diff
4468
Changelog.yaml
4468
Changelog.yaml
File diff suppressed because it is too large
Load Diff
@ -10,11 +10,11 @@ class Alternet(BasicNewsRecipe):
|
||||
category = 'News, Magazine'
|
||||
description = 'News magazine and online community'
|
||||
feeds = [
|
||||
(u'Front Page', u'http://feeds.feedblitz.com/alternet'),
|
||||
(u'Breaking News', u'http://feeds.feedblitz.com/alternet_breaking_news'),
|
||||
(u'Top Ten Campaigns', u'http://feeds.feedblitz.com/alternet_top_10_campaigns'),
|
||||
(u'Special Coverage Areas', u'http://feeds.feedblitz.com/alternet_coverage')
|
||||
]
|
||||
(u'Front Page', u'http://feeds.feedblitz.com/alternet'),
|
||||
(u'Breaking News', u'http://feeds.feedblitz.com/alternet_breaking_news'),
|
||||
(u'Top Ten Campaigns', u'http://feeds.feedblitz.com/alternet_top_10_campaigns'),
|
||||
(u'Special Coverage Areas', u'http://feeds.feedblitz.com/alternet_coverage')
|
||||
]
|
||||
remove_attributes = ['width', 'align','cellspacing']
|
||||
remove_javascript = True
|
||||
use_embedded_content = False
|
||||
@ -36,3 +36,5 @@ class Alternet(BasicNewsRecipe):
|
||||
self.temp_files[-1].write(html)
|
||||
self.temp_files[-1].close()
|
||||
return self.temp_files[-1].name
|
||||
|
||||
conversion_options = {'linearize_tables': True}
|
||||
|
@ -11,7 +11,6 @@ class AssociatedPress(BasicNewsRecipe):
|
||||
language = 'en'
|
||||
no_stylesheets = True
|
||||
max_articles_per_feed = 15
|
||||
html2lrf_options = ['--force-page-break-before-tag="chapter"']
|
||||
|
||||
|
||||
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
|
44
recipes/birmingham_post.recipe
Normal file
44
recipes/birmingham_post.recipe
Normal file
@ -0,0 +1,44 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
    # Recipe for the Birmingham Post (UK), built from the RSS feeds listed in
    # ``feeds``. ``auto_cleanup`` is enabled, so the manual tag filters below
    # are intentionally left empty (earlier attempts are kept as comments).

    title = u'Birmingham post'
    description = 'News for Birmingham UK'
    timefmt = ''
    __author__ = 'Dave Asbury'
    cover_url = 'http://1.bp.blogspot.com/_GwWyq5eGw9M/S9BHPHxW55I/AAAAAAAAB6Q/iGCWl0egGzg/s320/Birmingham+post+Lite+front.JPG'
    oldest_article = 1
    max_articles_per_feed = 20
    remove_empty_feeds = True
    remove_javascript = True
    auto_cleanup = True
    language = 'en_GB'

    masthead_url = 'http://www.pressgazette.co.uk/Pictures/web/t/c/g/birmingham_post.jpg'

    keep_only_tags = [
        #dict(name='h1',attrs={'id' : 'article-headline'}),
        #dict(attrs={'class':['article-meta-author','article-meta-date','article main','art-o art-align-center otm-1 ']}),
        #dict(name='p')
        #dict(attrs={'id' : 'three-col'})
    ]
    remove_tags = [
        # dict(name='div',attrs={'class' : 'span-33 last header-links'})
    ]
    feeds = [
        #(u'News',u'http://www.birminghampost.net/news/rss.xml'),
        (u'Local News', u'http://www.birminghampost.net/news/west-midlands-news/rss.xml'),
        (u'UK News', u'http://www.birminghampost.net/news/uk-news/rss.xml'),
        (u'Sports',u'http://www.birminghampost.net/midlands-birmingham-sport/rss.xml'),
        (u'Bloggs & Comments',u'http://www.birminghampost.net/comment/rss.xml')
    ]
    # NOTE: the body rule previously ended with a stray quote character and
    # listed the font family before the size, which is not a valid ``font``
    # shorthand; both are corrected here so the rule (and the h1 rule that
    # follows it) is actually parsed.
    extra_css = '''
        body {font: medium sans-serif;}
        h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
        h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
        span{ font-size:9.5px; font-weight:bold;font-style:italic}
        p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}

    '''
|
@ -1,6 +1,6 @@
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
blic.rs
|
||||
'''
|
||||
@ -73,7 +73,10 @@ class Blic(BasicNewsRecipe):
|
||||
def print_version(self, url):
    """Return ``url`` with the ``/print`` path suffix appended."""
    suffix = '/print'
    return url + suffix
|
||||
|
||||
def preprocess_html(self, soup):
    """Drop the ``style`` attribute from every element that has one.

    The same ``soup`` object is modified in place and returned.
    """
    styled_elements = soup.findAll(style=True)
    for element in styled_elements:
        del element['style']
    return soup
|
||||
def get_cover_url(self):
    """Return the cover URL taken from the blic.rs front page, if present.

    Looks for the ``<a id="blic_naslovna_print">`` element on the front page
    and prefixes its ``href`` with the site root. Returns ``None`` when the
    element is not found.
    """
    site_root = 'http://www.blic.rs'
    front_page = self.index_to_soup(site_root + '/')
    cover_link = front_page.find('a', attrs={'id':'blic_naslovna_print'})
    if not cover_link:
        return None
    return site_root + cover_link['href']
|
||||
|
@ -7,6 +7,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
|
||||
description = 'Fashion, beauty and Gossip for women from COSMOPOLITAN -UK'
|
||||
|
||||
__author__ = 'Dave Asbury'
|
||||
#last update 21/12/11
|
||||
# greyscale code by Starson
|
||||
cover_url = 'http://www.cosmopolitan.magazine.co.uk/files/4613/2085/8988/Cosmo_Cover3.jpg'
|
||||
no_stylesheets = True
|
||||
@ -31,8 +32,9 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
|
||||
dict(name='div',attrs={'class' : ['blogInfo','viral_toolbar','comment_number','prevEntry nav']}),
|
||||
dict(name='div',attrs={'class' : 'blog_module_about_the_authors'}),
|
||||
dict(attrs={'id': ['breadcrumbs','comment','related_links_list','right_rail','content_sec_fb_more','content_sec_mostpopularstories','content-sec_fb_frame_viewfb_bot']}),
|
||||
dict(attrs={'class' : ['read_liked_that_header','fb_back_next_area']})
|
||||
]
|
||||
dict(attrs={'class' : ['read_liked_that_header','fb_back_next_area']}),
|
||||
dict(name='li',attrs={'class' : 'thumb'})
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'Love & Sex', u'http://www.cosmopolitan.co.uk/love-sex/rss/'), (u'Men', u'http://cosmopolitan.co.uk/men/rss/'), (u'Fashion', u'http://cosmopolitan.co.uk/fashion/rss/'), (u'Hair & Beauty', u'http://cosmopolitan.co.uk/beauty-hair/rss/'), (u'LifeStyle', u'http://cosmopolitan.co.uk/lifestyle/rss/'), (u'Cosmo On Campus', u'http://cosmopolitan.co.uk/campus/rss/'), (u'Celebrity Gossip', u'http://cosmopolitan.co.uk/celebrity-gossip/rss/')]
|
||||
@ -48,4 +50,3 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
|
||||
img.type = "GrayscaleType"
|
||||
img.save(iurl)
|
||||
return soup
|
||||
|
||||
|
@ -5,7 +5,7 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
|
||||
description = 'News as provide by The Daily Mirror -UK'
|
||||
|
||||
__author__ = 'Dave Asbury'
|
||||
# last updated 30/10/11
|
||||
# last updated 26/12/11
|
||||
language = 'en_GB'
|
||||
|
||||
cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
|
||||
@ -13,30 +13,22 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
|
||||
masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'
|
||||
|
||||
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 30
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 20
|
||||
remove_empty_feeds = True
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
extra_css = '''
|
||||
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
|
||||
'''
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div',attrs={'id' : 'body-content'})
|
||||
]
|
||||
|
||||
remove_tags_after = [dict (name='div',attrs={'class' : 'related'})]
|
||||
|
||||
auto_cleanup = True
|
||||
remove_tags = [
|
||||
dict(name='div',attrs={'id' : ['sidebar','menu','search-box','roffers-top']}),
|
||||
dict(name='div',attrs={'class' :['inline-ad span-16 last','article-resize','related','list teasers']}),
|
||||
dict(attrs={'class' : ['channellink','article-tags','replace','append-html']}),
|
||||
dict(name='div',attrs={'class' : 'span-12 last sl-others addthis_toolbox addthis_default_style'})
|
||||
dict(name='title'),
|
||||
dict(name='div',attrs={'class' : ['inline-ad span-16 last','caption']}),
|
||||
]
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<dl class="q-search">.*?</dl>', re.IGNORECASE | re.DOTALL), lambda match: '')]
|
||||
(re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: '')]
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'Advertisement >>', re.IGNORECASE | re.DOTALL), lambda match: '')]
|
||||
|
||||
|
||||
feeds = [
|
||||
@ -53,5 +45,10 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
|
||||
,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
|
||||
|
||||
# example of commented out feed not needed ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
|
||||
|
||||
]
|
||||
extra_css = '''
|
||||
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
|
||||
h1{ font-size:18px;}
|
||||
img { display:block}
|
||||
'''
|
||||
|
||||
|
11
recipes/derin_dusunce.recipe
Normal file
11
recipes/derin_dusunce.recipe
Normal file
@ -0,0 +1,11 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class BasicUserRecipe1324913694(BasicNewsRecipe):
    # Turkish-language recipe for derindusunce.org, driven by a single feed.

    # Identification
    title = u'Derin Dusunce'
    __author__ = 'asalet_r'
    language = 'tr'

    # Download limits and clean-up
    oldest_article = 7
    max_articles_per_feed = 20
    auto_cleanup = True

    feeds = [
        (u'Derin D\xfc\u015f\xfcnce', u'http://www.derindusunce.org/feed/'),
    ]
|
12
recipes/dunya_bizim.recipe
Normal file
12
recipes/dunya_bizim.recipe
Normal file
@ -0,0 +1,12 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class BasicUserRecipe1324736687(BasicNewsRecipe):
    # Turkish-language recipe for dunyabizim.com; one feed per site category.

    # Identification
    title = u'D\xfcnya Bizim'
    __author__ = 'asalet_r'
    language = 'tr'

    # Download limits and clean-up
    oldest_article = 7
    max_articles_per_feed = 10
    auto_cleanup = True

    # (feed title, feed URL) pairs, one per line for readability.
    feeds = [
        (u'Aktif \u0130mamlar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=31'),
        (u'Ayr\u0131nt\u0131 Defteri', u'http://dunyabizim.com/servisler/rss.php?kategoriID=58'),
        (u'Baba Kitaplar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=4'),
        (u'Bu da Oldu', u'http://dunyabizim.com/servisler/rss.php?kategoriID=32'),
        (u'\xc7-al\u0131nt\u0131 Yaz\u0131lar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=33'),
        (u'Dar\xfclmedya', u'http://dunyabizim.com/servisler/rss.php?kategoriID=49'),
        (u'Gidenler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=59'),
        (u'G\xfczel Mekanlar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=43'),
        (u'\u0130yi Haberler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=18'),
        (u'\u0130yi M\xfczikler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=2'),
        (u'Kalite Dergiler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=3'),
        (u'Konu\u015fa Konu\u015fa', u'http://dunyabizim.com/servisler/rss.php?kategoriID=24'),
        (u'M\xfcstesta G\xfczeller', u'http://dunyabizim.com/servisler/rss.php?kategoriID=65'),
        (u'O \u015eimdi Nerede?', u'http://dunyabizim.com/servisler/rss.php?kategoriID=52'),
        (u'Olsa Ke\u015fke', u'http://dunyabizim.com/servisler/rss.php?kategoriID=34'),
        (u'Orada Ne Oldu?', u'http://dunyabizim.com/servisler/rss.php?kategoriID=38'),
        (u'\xd6nemli Adamlar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=1'),
        (u'Polemik', u'http://dunyabizim.com/servisler/rss.php?kategoriID=39'),
        (u'Sinema', u'http://dunyabizim.com/servisler/rss.php?kategoriID=23'),
        (u'Yalan Haber', u'http://dunyabizim.com/servisler/rss.php?kategoriID=40'),
        (u'Yeni \u015eeyler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=57'),
        (u'Zekeriya Sofras\u0131', u'http://dunyabizim.com/servisler/rss.php?kategoriID=60'),
    ]
|
12
recipes/dunya_bulteni.recipe
Normal file
12
recipes/dunya_bulteni.recipe
Normal file
@ -0,0 +1,12 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class BasicUserRecipe1321194347(BasicNewsRecipe):
    # Turkish-language recipe for dunyabulteni.net; one feed per site category.

    # Identification
    title = u'D\xfcnya B\xfclteni'
    __author__ = 'asalet_r'
    language = 'tr'

    # Download limits and clean-up
    oldest_article = 7
    max_articles_per_feed = 50
    auto_cleanup = True

    # (feed title, feed URL) pairs, one per line for readability.
    feeds = [
        (u'Tarih Dosyas\u0131', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=157'),
        (u'R\xf6portaj', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=153'),
        (u'Makale-Yorum', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=174'),
        (u'K\xfclt\xfcr-Sanat', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=66'),
        (u'Hayat\u0131n \u0130\xe7inden', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=200'),
        (u'Haber Analiz', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=123'),
        (u'Gezi-\u0130zlenim', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=90'),
        (u'Aile Sa\u011fl\u0131k E\u011fitim', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=75'),
    ]
|
46
recipes/echo_online.recipe
Normal file
46
recipes/echo_online.recipe
Normal file
@ -0,0 +1,46 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'
|
||||
'''
|
||||
Fetch echo-online.de
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
class Echo_Online(BasicNewsRecipe):
    # Recipe for echo-online.de: articles come from the regional RSS feeds
    # below and are fetched through the URL built by print_version().

    # Identification
    title = u'Echo Online' # 2011-12-28 AGe
    __author__ = 'Armin Geller' # 2011-12-28 AGe
    description = '-Echo Online-'
    publisher = 'Echo Online GmbH'
    category = 'News, Germany'
    language = 'de'
    lang = 'de-DE'
    encoding = 'iso-8859-1'
    timefmt = ' [%a, %d %b %Y]'
    cover_url = 'http://adcounter.darmstaedter-echo.de/webdav/files/config/gui/images/Zeitungsfaecher.gif'

    # Download limits
    oldest_article = 7
    max_articles_per_feed = 50 # 2011-12-28 AGe

    # Clean-up
    no_stylesheets = True
    remove_javascript = True
    auto_cleanup = True
    auto_cleanup_keep = '//div[@class="bild_gross w270"]'
    remove_tags = [dict(name='div', attrs={'class':["header", "name"]}),]

    feeds = [
        (u'Topnews', u'http://www.echo-online.de/storage/rss/rss/topnews.xml'),
        (u'Darmstadt', u'http://www.echo-online.de/rss/darmstadt.xml'),
        (u'Darmstadt-Dieburg', u'http://www.echo-online.de/rss/darmstadtdieburg.xml'),
        (u'Kreis Gro\xdf-Gerau', u'http://www.echo-online.de/rss/kreisgrossgerau.xml'),
        (u'R\xfcsselsheim', u'http://www.echo-online.de/rss/ruesselsheim.xml'),
        (u'Kreis Bergstra\xdfe', u'http://www.echo-online.de/rss/bergstrasse.xml'),
        (u'Odenwaldkreis', u'http://www.echo-online.de/rss/odenwald.xml'),
        (u'SV 98', u'http://www.echo-online.de/rss/sv98.xml'),
        (u'Kino', u'http://www.echo-online.de/rss/kino.xml'),
        (u'Ausstellungen', u'http://www.echo-online.de/rss/ausstellungen.xml'),
        (u'Ausflug & Reise', u'http://www.echo-online.de/rss/ausflugreise.xml'),
    ]

    def print_version(self, url):
        """Return the print-format URL for the article at ``url``.

        The feed URL is opened with the recipe browser so that the final
        URL reported by ``geturl()`` is used, and the print query string
        is appended to it.
        """
        final_url = self.browser.open_novisit(url).geturl()
        return final_url + '?_FRAME=33&_FORMAT=PRINT'
|
||||
|
50
recipes/edge_conversations.recipe
Normal file
50
recipes/edge_conversations.recipe
Normal file
@ -0,0 +1,50 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012 Levien van Zon <levien@zonnetjes.net>'
|
||||
|
||||
'''
|
||||
Fetch Edge.org conversations
|
||||
'''
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class EdgeConversationRSS(BasicNewsRecipe):
    # Recipe for Edge.org: only feed entries whose title contains
    # 'CONVERSATION' are kept (see parse_feeds), and PDF links are dropped.

    title = u'Edge.org Conversations'
    __author__ = 'levien'
    language = 'en'
    description = '''Edge.org offers "open-minded, free ranging, intellectually
playful ... an unadorned pleasure in curiosity, a collective expression of
wonder at the living and inanimate world ... an ongoing and thrilling
colloquium.'''

    oldest_article = 60
    max_articles_per_feed = 100
    no_stylesheets = True

    keep_only_tags = [dict(name='div', attrs={'class':'HomeLeftPannel IMGCTRL'}) ]
    remove_tags = [
        dict(name='div',attrs={'class':'Logo'})
    ]

    feeds = [(u'Edge RSS', u'http://edge.org/feeds/')]

    def print_version(self, url):
        """Rewrite a ``conversation/<id>`` URL to ``conversation.php?cid=<id>``."""
        old_part, new_part = 'conversation/', 'conversation.php?cid='
        return url.replace(old_part, new_part)

    def parse_feeds(self):
        """Parse the feeds as usual, then filter each feed's article list.

        An article is kept only if its title contains 'CONVERSATION' and
        its URL does not contain 'pdf'. The article lists are modified in
        place and the list of feeds is returned.
        """
        parsed = BasicNewsRecipe.parse_feeds(self)
        for feed in parsed:
            feed.articles[:] = [
                entry for entry in feed.articles
                if 'CONVERSATION' in entry.title and 'pdf' not in entry.url
            ]
        return parsed
|
||||
|
48
recipes/elet_es_irodalom.recipe
Normal file
48
recipes/elet_es_irodalom.recipe
Normal file
@ -0,0 +1,48 @@
|
||||
################################################################################
|
||||
#Description: http://es.hu/ RSS channel
|
||||
#Author: Bigpapa (bigpapabig@hotmail.com)
|
||||
#Date: 2010.12.01. - V1.0
|
||||
################################################################################
|
||||
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class elet_es_irodalom(BasicNewsRecipe):
    # Hungarian-language recipe for es.hu; the feeds are feed43.com channels,
    # one per section of the paper.

    # Identification
    title = u'Elet es Irodalom'
    __author__ = 'Bigpapa'
    category = 'Cikkek'
    language = 'hu'
    publication_type = 'newsportal'
    encoding = 'iso-8859-2'

    # Download limits
    oldest_article = 7
    max_articles_per_feed = 20 # Maximum number of articles stored per feed in the generated e-book.
    #delay = 1
    use_embedded_content = False

    # Styling and clean-up
    no_stylesheets = True
    extra_css = '.doc_title { font: bold 30px } .doc_author {font: bold 14px} '

    keep_only_tags = [
        dict(name='div', attrs={'class':['doc_author', 'doc_title', 'doc']})
    ]

    remove_tags = [
        dict(name='a', attrs={'target':['_TOP']}),
        dict(name='div', attrs={'style':['float: right; margin-left: 5px; margin-bottom: 5px;', 'float: right; margin-left: 5px; margin-bottom: 5px;']}),
    ]

    feeds = [
        (u'Publicisztika', 'http://www.feed43.com/4684235031168504.xml'),
        (u'Interj\xfa', 'http://www.feed43.com/4032465460040618.xml'),
        (u'Visszhang', 'http://www.feed43.com/3727375706873086.xml'),
        (u'P\xe1ratlan oldal', 'http://www.feed43.com/2525784782475057.xml'),
        (u'Feuilleton', 'http://www.feed43.com/7216025082703073.xml'),
        (u'Pr\xf3za', 'http://www.feed43.com/8760248802326384.xml'),
        (u'Vers', 'http://www.feed43.com/1737324675134275.xml'),
        (u'K\xf6nyvkritika', 'http://www.feed43.com/1281156550717082.xml'),
        (u'M\u0171b\xedr\xe1lat', 'http://www.feed43.com/1851854623681044.xml')
    ]
|
@ -20,7 +20,7 @@ class ESPN(BasicNewsRecipe):
|
||||
|
||||
use_embedded_content = False
|
||||
remove_javascript = True
|
||||
needs_subscription = True
|
||||
needs_subscription = 'optional'
|
||||
encoding= 'ISO-8859-1'
|
||||
|
||||
remove_tags_before = dict(name='font', attrs={'class':'date'})
|
||||
@ -75,32 +75,30 @@ class ESPN(BasicNewsRecipe):
|
||||
|
||||
return soup
|
||||
|
||||
|
||||
|
||||
def get_browser(self):
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
br.set_handle_refresh(False)
|
||||
url = ('https://r.espn.go.com/members/v3_1/login')
|
||||
raw = br.open(url).read()
|
||||
raw = re.sub(r'(?s)<form>.*?id="regsigninbtn".*?</form>', '', raw)
|
||||
with TemporaryFile(suffix='.htm') as fname:
|
||||
with open(fname, 'wb') as f:
|
||||
f.write(raw)
|
||||
br.open_local_file(fname)
|
||||
if self.username and self.password:
|
||||
br.set_handle_refresh(False)
|
||||
url = ('https://r.espn.go.com/members/v3_1/login')
|
||||
raw = br.open(url).read()
|
||||
raw = re.sub(r'(?s)<form>.*?id="regsigninbtn".*?</form>', '', raw)
|
||||
with TemporaryFile(suffix='.htm') as fname:
|
||||
with open(fname, 'wb') as f:
|
||||
f.write(raw)
|
||||
br.open_local_file(fname)
|
||||
|
||||
br.form = br.forms().next()
|
||||
br.form.find_control(name='username', type='text').value = self.username
|
||||
br.form['password'] = self.password
|
||||
br.submit().read()
|
||||
br.open('http://espn.go.com').read()
|
||||
br.set_handle_refresh(True)
|
||||
br.form = br.forms().next()
|
||||
br.form.find_control(name='username', type='text').value = self.username
|
||||
br.form['password'] = self.password
|
||||
br.submit().read()
|
||||
br.open('http://espn.go.com').read()
|
||||
br.set_handle_refresh(True)
|
||||
return br
|
||||
|
||||
def get_article_url(self, article):
    """Return the feed entry's ``guid`` value, or ``None`` if it has none."""
    guid = article.get('guid')
    return guid
|
||||
|
||||
def print_version(self, url):
|
||||
|
||||
if 'eticket' in url:
|
||||
return url.partition('&')[0].replace('story?', 'print?')
|
||||
match = re.search(r'story\?(id=\d+)', url)
|
||||
|
30
recipes/fhm_uk.recipe
Normal file
30
recipes/fhm_uk.recipe
Normal file
@ -0,0 +1,30 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    # Recipe for FHM UK, built from three feed43.com channels. Article
    # content is selected manually via keep_only_tags (auto_cleanup is off).

    # Identification
    title = u'FHM UK'
    description = 'Good News for Men'
    __author__ = 'Dave Asbury'
    # last updated 27/12/11
    language = 'en_GB'
    cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
    masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'

    # Download limits
    oldest_article = 28
    max_articles_per_feed = 12
    remove_empty_feeds = True

    # Clean-up
    no_stylesheets = True
    #auto_cleanup = True
    #articles_are_obfuscated = True
    keep_only_tags = [
        dict(name='h1'),
        dict(name='img',attrs={'id' : 'ctl00_Body_imgMainImage'}),
        dict(name='div',attrs={'id' : ['articleLeft']}),
        dict(name='div',attrs={'class' : ['imagesCenterArticle','containerCenterArticle','articleBody']}),
    ]

    feeds = [
        (u'From the Homepage',u'http://feed43.com/8053226782885416.xml'),
        (u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),
        (u'Gaming',u'http://feed43.com/0755006465351035.xml'),
    ]
|
@ -1,4 +1,3 @@
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class GlasgowHerald(BasicNewsRecipe):
|
||||
@ -9,12 +8,16 @@ class GlasgowHerald(BasicNewsRecipe):
|
||||
language = 'en_GB'
|
||||
|
||||
__author__ = 'Kovid Goyal'
|
||||
use_embedded_content = False
|
||||
|
||||
keep_only_tags = [dict(attrs={'class':'article'})]
|
||||
remove_tags = [
|
||||
dict(id=['pic-nav']),
|
||||
dict(attrs={'class':['comments-top']})
|
||||
]
|
||||
no_stylesheets = True
|
||||
auto_cleanup = True
|
||||
|
||||
#keep_only_tags = [dict(attrs={'class':'article'})]
|
||||
#remove_tags = [
|
||||
#dict(id=['pic-nav']),
|
||||
#dict(attrs={'class':['comments-top']})
|
||||
#]
|
||||
|
||||
|
||||
feeds = [
|
||||
@ -25,5 +28,4 @@ class GlasgowHerald(BasicNewsRecipe):
|
||||
(u'Arts & Entertainment',
|
||||
u'http://www.heraldscotland.com/cmlink/1.768',),
|
||||
(u'Columnists', u'http://www.heraldscotland.com/cmlink/1.658574')]
|
||||
|
||||
|
||||
|
||||
|
13
recipes/goal.recipe
Normal file
13
recipes/goal.recipe
Normal file
@ -0,0 +1,13 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1325677767(BasicNewsRecipe):
    # Italian-language recipe for goal.com, driven by a single news feed.

    # Identification
    title = u'Goal'
    __author__ = 'faber1971'
    description = 'Sports news from Italy'
    language = 'it'

    # Download limits
    oldest_article = 1
    max_articles_per_feed = 100

    # Clean-up
    auto_cleanup = True
    remove_tags_after = [dict(id='article_content')]

    feeds = [
        (u'Goal', u'http://www.goal.com/it/feeds/news?fmt=rss'),
    ]
|
||||
|
106
recipes/grantland.recipe
Normal file
106
recipes/grantland.recipe
Normal file
@ -0,0 +1,106 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class GrantLand(BasicNewsRecipe):
    # Recipe for grantland.com. There is no RSS feed list here: the index is
    # built by scraping the category pages listed in CATEGORIES (see
    # parse_index), and articles are fetched via their '?view=print' URL.

    title = u"Grantland"
    description = 'Writings on Sports & Pop Culture'
    language = 'en'
    __author__ = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = False
    # auto_cleanup is too aggressive sometimes and we end up with blank articles
    auto_cleanup = False
    timefmt = ' [%a, %d %b %Y]'
    oldest_article = 365

    cover_url = 'http://cdn0.sbnation.com/imported_assets/740965/blog_grantland_grid_3.jpg'
    masthead_url = 'http://a1.espncdn.com/prod/assets/grantland/grantland-logo.jpg'

    INDEX = 'http://www.grantland.com'
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, url suffix, max number of articles to load)
        ('Today in Grantland','',20),
        ('In Case You Missed It','incaseyoumissedit',35),
    ]

    remove_tags = [
        {'name':['head','style','script']},
        {'id':['header']},
        {'class':re.compile(r'\bside|\bad\b|floatright|tags')}
    ]
    remove_tags_before = {'class':'wrapper'}
    remove_tags_after = [{'id':'content'}]

    preprocess_regexps = [
        # <header> tags with an img inside are just blog banners, don't need them
        # note: there are other useful <header> tags so we don't want to just strip all of them
        (re.compile(r'<header class.+?<img .+?>.+?</header>', re.DOTALL|re.IGNORECASE),lambda m: ''),
        # delete everything between the *last* <hr class="small" /> and </article>
        (re.compile(r'<hr class="small"(?:(?!<hr class="small").)+</article>', re.DOTALL|re.IGNORECASE),lambda m: '<hr class="small" /></article>'),
    ]
    extra_css = """cite, time { font-size: 0.8em !important; margin-right: 1em !important; }
        img + cite { display:block; text-align:right}"""

    def parse_index(self):
        """Build the list of ``(category name, articles)`` feeds.

        Each category page is scraped for headline links (``h2`` on the
        front page, ``h3`` on the other category pages). A URL is only
        listed once across all categories. Each article is a dict with
        ``title``, ``url``, ``description`` and ``date`` keys; description
        and date are empty strings when the page provides none.
        """
        feeds = []
        seen_urls = set()

        for cat_name, suffix, max_articles in self.CATEGORIES:
            self.log('Reading category:', cat_name)
            articles = []

            page = "%s/%s" % (self.INDEX, suffix)
            soup = self.index_to_soup(page)
            headers = soup.findAll('h2' if suffix == '' else 'h3')

            for header in headers:
                link = header.find('a', href=True)
                if link is None:
                    continue
                url = link['href']
                if url in seen_urls:
                    continue
                title = self.tag_to_string(link)
                if 'Podcast:' in title or 'In Case You Missed It' in title:
                    continue

                desc = dt = ''
                # get at the div that contains description and other info
                div = header.parent.find('div')
                if div is not None:
                    desc = self.tag_to_string(div)
                    time_tag = div.find('time')
                    # keep dt as '' (not None) when there is no <time> tag
                    if time_tag is not None:
                        dt = self.tag_to_string(time_tag)

                # if div contains the same url that is in h2/h3
                # that means this is a series split into multiple articles
                # (a missing div simply means a stand-alone article)
                if div is not None and div.find('a', href=url):
                    self.log('\tFound series:', title)
                    # grab all articles in series
                    for part in div.findAll('a', href=True):
                        part_url = part['href']
                        if part_url in seen_urls:
                            continue
                        self.log('\t', part_url)
                        seen_urls.add(part_url)
                        articles.append({'title':title+' - '+self.tag_to_string(part),
                                         'url':part_url,'description':desc,'date':dt})
                else:
                    self.log('\tFound article:', title)
                    self.log('\t', url)
                    seen_urls.add(url)
                    articles.append({'title':title,'url':url,'description':desc,'date':dt})

                if len(articles) >= max_articles:
                    break

            if articles:
                feeds.append((cat_name, articles))

        return feeds

    def print_version(self, url):
        """Return ``url`` with the ``?view=print`` query string appended."""
        return url+'?view=print'
|
11
recipes/haksoz.recipe
Normal file
11
recipes/haksoz.recipe
Normal file
@ -0,0 +1,11 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class BasicUserRecipe1324739199(BasicNewsRecipe):
    # Turkish-language recipe for haksozhaber.net, driven by a single feed.

    # Identification
    title = u'Haks\xf6z'
    __author__ = 'asalet_r'
    language = 'tr'

    # Download limits and clean-up
    oldest_article = 7
    max_articles_per_feed = 20
    auto_cleanup = True

    feeds = [
        (u'Haks\xf6z', u'http://www.haksozhaber.net/rss/'),
    ]
|
58
recipes/hamilton_spectator.recipe
Normal file
58
recipes/hamilton_spectator.recipe
Normal file
@ -0,0 +1,58 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
'''
|
||||
Hamilton Spectator Calibre Recipe
|
||||
'''
|
||||
class HamiltonSpectator(BasicNewsRecipe):
    # Recipe for the Hamilton Spectator (thespec.com); one RSS feed per
    # site section, cleaned with auto_cleanup.

    title = u'Hamilton Spectator'
    oldest_article = 2
    max_articles_per_feed = 100
    auto_cleanup = True
    __author__ = u'Eric Coolman'
    publisher = u'thespec.com'
    description = u'Ontario Canada Newspaper'
    category = u'News, Ontario, Canada'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'en_CA'
    encoding = 'utf-8'

    feeds = [
        (u'Top Stories',u'http://www.thespec.com/rss?query=/&assetType=Article'),
        (u'All News',u'http://www.thespec.com/rss?query=/news&assetType=Article'),
        (u'Local',u'http://www.thespec.com/rss?query=/local&assetType=Article'),
        (u'Ontario',u'http://www.thespec.com/rss?query=/ontario&assetType=Article'),
        (u'Canada',u'http://www.thespec.com/rss?query=/canada&assetType=Article'),
        (u'World News',u'http://www.thespec.com/rss?query=/world&assetType=Article'),
        (u'Business',u'http://www.thespec.com/rss?query=/business&assetType=Article'),
        (u'Crime',u'http://www.thespec.com/rss?query=/crime&assetType=Article'),
        (u'All Sports',u'http://www.thespec.com/rss?query=/sports&assetType=Article'),
        (u'Ticats',u'http://www.thespec.com/rss?query=/sports/ticats&assetType=Article'),
        (u'Bulldogs',u'http://www.thespec.com/rss?query=/sports/bulldogs&assetType=Article'),
        (u'High School Sports',u'http://www.thespec.com/rss?query=/sports/highschools&assetType=Article'),
        (u'Local Sports',u'http://www.thespec.com/rss?query=/sports/local&assetType=Article'),
        # Double-quoted so the apostrophe is kept; the previous u'What''s On'
        # was two adjacent literals and produced the title "Whats On".
        (u"What's On",u'http://www.thespec.com/rss?query=/whatson&assetType=Article'),
        (u'Arts and Entertainment',u'http://www.thespec.com/rss?query=/whatson/artsentertainment&assetType=Article'),
        (u'Books',u'http://www.thespec.com/rss?query=/whatson/books&assetType=Article'),
        (u'Movies',u'http://www.thespec.com/rss?query=/whatson/movies&assetType=Article'),
        (u'Music',u'http://www.thespec.com/rss?query=/whatson/music&assetType=Article'),
        (u'Restaurant Reviews',u'http://www.thespec.com/rss?query=/whatson/restaurants&assetType=Article'),
        (u'Opinion',u'http://www.thespec.com/rss?query=/opinion&assetType=Article'),
        (u'Opinion Columns',u'http://www.thespec.com/rss?query=/opinion/columns&assetType=Article'),
        (u'Cartoons',u'http://www.thespec.com/rss?query=/opinion/cartoons&assetType=Article'),
        (u'Letters',u'http://www.thespec.com/rss?query=/opinion/letters&assetType=Article'),
        (u'Editorial',u'http://www.thespec.com/rss?query=/opinion/editorial&assetType=Article'),
        (u'Community',u'http://www.thespec.com/rss?query=/community&assetType=Article'),
        (u'Education',u'http://www.thespec.com/rss?query=/community/education&assetType=Article'),
        (u'Faith',u'http://www.thespec.com/rss?query=/community/faith&assetType=Article'),
        (u'Contests',u'http://www.thespec.com/rss?query=/community/contests&assetType=Article'),
        (u'Living',u'http://www.thespec.com/rss?query=/living&assetType=Article'),
        (u'Food',u'http://www.thespec.com/rss?query=/living/food&assetType=Article'),
        (u'Health and Fitness',u'http://www.thespec.com/rss?query=/living/healthfitness&assetType=Article'),
        (u'Your Home',u'http://www.thespec.com/rss?query=/living/home&assetType=Article'),
        (u'Travel',u'http://www.thespec.com/rss?query=/living/travel&assetType=Article'),
        (u'Family and Parenting',u'http://www.thespec.com/rss?query=/living/familyparenting&assetType=Article'),
        (u'Style',u'http://www.thespec.com/rss?query=/living/style&assetType=Article')
    ]
|
||||
|
@ -1,4 +1,5 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import urllib, re
|
||||
|
||||
class HindustanTimes(BasicNewsRecipe):
|
||||
title = u'Hindustan Times'
|
||||
@ -26,4 +27,24 @@ class HindustanTimes(BasicNewsRecipe):
|
||||
'http://feeds.hindustantimes.com/HT-Homepage-LifestyleNews'),
|
||||
]
|
||||
|
||||
def get_article_url(self, article):
    '''
    HT uses a variant of the feedportal RSS ad display mechanism

    Two strategies are tried, in order:

    1. Look in the feed item's summary for an anchor whose href contains
       ``bookmark.cfm`` and a ``link=`` parameter, and return that
       parameter URL-unquoted.
    2. Otherwise open the default article URL, take the second-to-last
       path segment of the final (post-redirect) URL and decode its
       two-character ``0X`` escape codes back into a normal URL.

    :param article: feed entry; its ``summary`` attribute is used if present
    :return: the decoded article URL
    '''
    try:
        s = article.summary
        return urllib.unquote(
            re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
    except Exception:
        # Best effort only: a missing summary or a summary without the
        # bookmark link falls through to the redirect-based decoding below.
        pass
    url = BasicNewsRecipe.get_article_url(self, article)
    res = self.browser.open_novisit(url)
    url = res.geturl().split('/')[-2]
    encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
                '0D': '?', '0E': '-', '0N': '.com', '0L': 'http://',
                '0S': 'www.'}
    # Decode in a single left-to-right pass. Chained str.replace() calls in
    # dict order could turn a decoded literal '0' plus the following letter
    # into a spurious code (e.g. '0AB' -> '0B' -> '.'). Unknown codes are
    # left untouched, as before.
    return re.sub(r'0[A-Z]',
                  lambda m: encoding.get(m.group(0), m.group(0)), url)
|
||||
|
||||
|
||||
|
@ -1,44 +1,58 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
################################################################################
|
||||
#Description: http://hvg.hu/ RSS channel
|
||||
#Author: Bigpapa (bigpapabig@hotmail.com)
|
||||
#Date: 2011.12.20. - V1.1
|
||||
################################################################################
|
||||
|
||||
class HVG(BasicNewsRecipe):
|
||||
title = 'HVG.HU'
|
||||
__author__ = u'István Papp'
|
||||
description = u'Friss hírek a HVG-től'
|
||||
timefmt = ' [%Y. %b. %d., %a.]'
|
||||
oldest_article = 4
|
||||
language = 'hu'
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
encoding = 'utf8'
|
||||
publisher = 'HVG Online'
|
||||
category = u'news, hírek, hvg'
|
||||
extra_css = 'body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
|
||||
preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
|
||||
remove_tags_before = dict(id='pg-content')
|
||||
remove_javascript = True
|
||||
remove_empty_feeds = True
|
||||
class hvg(BasicNewsRecipe):
|
||||
title = u'HVG'
|
||||
__author__ = 'Bigpapa'
|
||||
language = 'hu'
|
||||
oldest_article = 5 # Hany napos legyen a legregebbi cikk amit leszedjen.
|
||||
max_articles_per_feed = 5 # Az adott e-bookban tarolt cikkek feedenkenti maximalis szamat adja meg.
|
||||
no_stylesheets = True
|
||||
encoding = 'utf8'
|
||||
extra_css = ' h2 { font:bold 28px} '
|
||||
|
||||
feeds = [
|
||||
(u'Itthon', u'http://hvg.hu/rss/itthon')
|
||||
,(u'Világ', u'http://hvg.hu/rss/vilag')
|
||||
,(u'Gazdaság', u'http://hvg.hu/rss/gazdasag')
|
||||
,(u'IT | Tudomány', u'http://hvg.hu/rss/tudomany')
|
||||
,(u'Panoráma', u'http://hvg.hu/rss/Panorama')
|
||||
,(u'Karrier', u'http://hvg.hu/rss/karrier')
|
||||
,(u'Gasztronómia', u'http://hvg.hu/rss/gasztronomia')
|
||||
,(u'Helyi érték', u'http://hvg.hu/rss/helyiertek')
|
||||
,(u'Kultúra', u'http://hvg.hu/rss/kultura')
|
||||
,(u'Cégautó', u'http://hvg.hu/rss/cegauto')
|
||||
,(u'Vállalkozó szellem', u'http://hvg.hu/rss/kkv')
|
||||
,(u'Egészség', u'http://hvg.hu/rss/egeszseg')
|
||||
,(u'Vélemény', u'http://hvg.hu/rss/velemeny')
|
||||
,(u'Sport', u'http://hvg.hu/rss/sport')
|
||||
]
|
||||
remove_attributes = ['style','font', 'href']
|
||||
|
||||
def print_version(self, url):
|
||||
return url.replace ('#rss', '/print')
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'id':['pg-content']})
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class':['box articlemenu', 'bannergoogle468', 'boxcontainer left', 'boxcontainer', 'commentbox']}),
|
||||
dict(name='table', attrs={'class':['banner2', 'monocle']}),
|
||||
dict(name='div', attrs={'id':['connect_widget_4cf63ca849ddf4577922632', 'sharetip', 'upprev_box']}),
|
||||
dict(name='div', attrs={'style':['float: right; margin-bottom: 5px;', 'display: none;']}),
|
||||
dict(name='h3', attrs={'class':['hthree']}),
|
||||
dict(name='ul', attrs={'class':['defaultul']}),
|
||||
dict(name='form', attrs={'id':['commentForm']}),
|
||||
dict(name='h6', attrs={'class':['hthree']}),
|
||||
dict(name='h6', attrs={'class':['more2']}),
|
||||
dict(name='img', attrs={'class':['framed']}),
|
||||
dict(name='td', attrs={'class':['greyboxbody','embedvideobody','embedvideofooter','embedvideobottom']}),
|
||||
|
||||
|
||||
|
||||
]
|
||||
|
||||
feeds = [
|
||||
# (u'\xd6sszes', 'http://hvg.hu/rss'),
|
||||
(u'Itthon', 'http://hvg.hu/rss/itthon'),
|
||||
(u'Vil\xe1g', 'http://hvg.hu/rss/vilag'),
|
||||
(u'Gazdas\xe1g', 'http://hvg.hu/rss/gazdasag'),
|
||||
(u'Tudom\xe1ny', 'http://hvg.hu/rss/tudomany'),
|
||||
(u'Panor\xe1ma', 'http://hvg.hu/rss/panorama'),
|
||||
(u'Karrier', 'http://hvg.hu/rss/karrier'),
|
||||
(u'Gasztron\xf3mia', 'http://hvg.hu/rss/gasztronomia'),
|
||||
(u'Helyi \xe9rt\xe9k', 'http://hvg.hu/rss/helyiertek'),
|
||||
(u'Kult\xfara', 'http://hvg.hu/rss/kultura'),
|
||||
(u'C\xe9gaut\xf3', 'http://hvg.hu/rss/cegauto'),
|
||||
(u'V\xe1llalkoz\xf3 szellem', 'http://hvg.hu/rss/kkv'),
|
||||
(u'Eg\xe9szs\xe9g', 'http://hvg.hu/rss/egeszseg'),
|
||||
(u'V\xe9lem\xe9ny', 'http://hvg.hu/rss/velemeny'),
|
||||
(u'Sport', 'http://hvg.hu/rss/sport')
|
||||
]
|
BIN
recipes/icons/mlody_technik_pl.png
Normal file
BIN
recipes/icons/mlody_technik_pl.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 2.1 KiB |
Binary file not shown.
Before Width: | Height: | Size: 15 KiB |
BIN
recipes/icons/moneynews.png
Normal file
BIN
recipes/icons/moneynews.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 914 B |
BIN
recipes/icons/novilist_novine_hr.png
Normal file
BIN
recipes/icons/novilist_novine_hr.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 241 B |
BIN
recipes/icons/novilist_portal_hr.png
Normal file
BIN
recipes/icons/novilist_portal_hr.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 944 B |
BIN
recipes/icons/rionegro.png
Normal file
BIN
recipes/icons/rionegro.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 817 B |
68
recipes/ideal_almeria.recipe
Normal file
68
recipes/ideal_almeria.recipe
Normal file
@ -0,0 +1,68 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'Josemi Liébana <office at josemi-liebana.com>'
|
||||
__copyright__ = 'Josemi Liébana'
|
||||
__version__ = 'v0.1'
|
||||
__date__ = '5 January 2012'
|
||||
|
||||
|
||||
'''
|
||||
www.ideal.es
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Ideal(BasicNewsRecipe):
    """Recipe for the Almeria edition of the newspaper Ideal (www.ideal.es).

    Purely declarative: the class only sets BasicNewsRecipe attributes
    (metadata, clean-up rules and the list of RSS feeds).
    """
    title = u'Ideal (Edición Almería)'
    __author__ = u'Josemi Liébana'
    description = u'Noticias de Almería y el resto del mundo'
    publisher = 'Ideal'
    category = u'News, Politics, Spain, Almería'
    publication_type = 'Newspaper'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    language = 'es'
    remove_empty_feeds = True
    masthead_url = u'http://www.ideal.es/img/rd.logotipo2_ideal.gif'
    # NOTE(review): the cover is a fixed image under
    # /granada/noticias/201112/24/ -- it is neither Almeria-specific nor
    # updated per issue; confirm this is intended.
    cover_url = u'http://www.ideal.es/granada/noticias/201112/24/Media/Granada/portada--647x894.JPG'
    extra_css = u' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .photo-caption{font-size: x-small} '

    # Conversion metadata reuses the class attributes defined above.
    conversion_options = {
        'comment' : description
        , 'tags' : category
        , 'publisher' : publisher
        , 'language' : language
    }

    # Article content is selected by element id / CSS class.
    keep_only_tags = [
        dict(attrs={'id':'title'})
        ,dict(attrs={'class':['overhead','headline','subhead','date','text','noticia_cont','desarrollo']})
    ]

    remove_tags = [dict(name='ul')]

    remove_attributes = ['width','height']

    # (section title, feed URL) pairs.
    # NOTE(review): the 'Local' feed ends in granada.xml although every
    # other URL here is under /almeria/ -- possibly a copy-paste leftover;
    # verify the feed name against the site.
    feeds = [
        (u'Última Hora' , u'http://www.ideal.es/almeria/rss/feeds/ultima.xml' )
        ,(u'Portada' , u'http://www.ideal.es/almeria/portada.xml' )
        ,(u'Local' , u'http://www.ideal.es/almeria/rss/feeds/granada.xml' )
        ,(u'Deportes' , u'http://www.ideal.es/almeria/rss/feeds/deportes.xml' )
        ,(u'Sociedad' , u'http://www.ideal.es/almeria/rss/feeds/sociedad.xml' )
        ,(u'Cultura' , u'http://www.ideal.es/almeria/rss/feeds/cultura.xml' )
        ,(u'Economía' , u'http://www.ideal.es/almeria/rss/feeds/economia.xml' )
        ,(u'Costa' , u'http://www.ideal.es/almeria/rss/feeds/costa.xml' )
        ,(u'Puerta Purchena' , u'http://www.ideal.es/almeria/rss/feeds/puerta_purchena.xml' )
        ,(u'Andalucía' , u'http://www.ideal.es/almeria/rss/feeds/andalucia.xml' )
        ,(u'España' , u'http://www.ideal.es/almeria/rss/feeds/espana.xml' )
        ,(u'Mundo' , u'http://www.ideal.es/almeria/rss/feeds/internacional.xml' )
        ,(u'Vivir' , u'http://www.ideal.es/almeria/rss/feeds/vivir.xml' )
        ,(u'Opinión' , u'http://www.ideal.es/almeria/rss/feeds/opinion.xml' )
        ,(u'Televisión' , u'http://www.ideal.es/almeria/rss/feeds/television.xml' )
        ,(u'Contraportada' , u'http://www.ideal.es/almeria/rss/feeds/contraportada.xml' )
    ]
|
||||
|
69
recipes/ideal_granada.recipe
Normal file
69
recipes/ideal_granada.recipe
Normal file
@ -0,0 +1,69 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'Josemi Liébana <office at josemi-liebana.com>'
|
||||
__copyright__ = 'Josemi Liébana'
|
||||
__version__ = 'v0.1'
|
||||
__date__ = '5 January 2012'
|
||||
|
||||
|
||||
'''
|
||||
www.ideal.es
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Ideal(BasicNewsRecipe):
    """Recipe for the Granada edition of the newspaper Ideal (www.ideal.es).

    Purely declarative: the class only sets BasicNewsRecipe attributes
    (metadata, clean-up rules and the list of RSS feeds).
    """
    title = u'Ideal (Edición Granada)'
    __author__ = u'Josemi Liébana'
    description = u'Noticias de Granada y el resto del mundo'
    publisher = 'Ideal'
    category = 'News, Politics, Spain, Granada'
    publication_type = 'Newspaper'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    language = 'es'
    remove_empty_feeds = True
    masthead_url = 'http://www.ideal.es/img/rd.logotipo2_ideal.gif'
    # NOTE(review): the cover is a fixed image under /noticias/201112/24/,
    # so it is not updated per issue; confirm this is intended.
    cover_url = 'http://www.ideal.es/granada/noticias/201112/24/Media/Granada/portada--647x894.JPG'
    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .photo-caption{font-size: x-small} '

    # Conversion metadata reuses the class attributes defined above.
    conversion_options = {
        'comment' : description
        , 'tags' : category
        , 'publisher' : publisher
        , 'language' : language
    }

    # Article content is selected by element id / CSS class.
    keep_only_tags = [
        dict(attrs={'id':'title'})
        ,dict(attrs={'class':['overhead','headline','subhead','date','text','noticia_cont','desarrollo']})
    ]

    remove_tags = [dict(name='ul')]

    remove_attributes = ['width','height']

    # (section title, feed URL) pairs.
    feeds = [
        (u'Última Hora' , u'http://www.ideal.es/granada/rss/feeds/ultima.xml' )
        ,(u'Portada' , u'http://www.ideal.es/granada/portada.xml' )
        ,(u'Local' , u'http://www.ideal.es/granada/rss/feeds/granada.xml' )
        ,(u'Deportes' , u'http://www.ideal.es/granada/rss/feeds/deportes.xml' )
        ,(u'Sociedad' , u'http://www.ideal.es/granada/rss/feeds/sociedad.xml' )
        ,(u'Cultura' , u'http://www.ideal.es/granada/rss/feeds/cultura.xml' )
        ,(u'Economía' , u'http://www.ideal.es/granada/rss/feeds/economia.xml' )
        ,(u'Costa' , u'http://www.ideal.es/granada/rss/feeds/costa.xml' )
        ,(u'La Carrera' , u'http://www.ideal.es/granada/rss/feeds/la_carrera.xml' )
        ,(u'Puerta Real' , u'http://www.ideal.es/granada/rss/feeds/puerta_real.xml' )
        ,(u'Andalucía' , u'http://www.ideal.es/granada/rss/feeds/andalucia.xml' )
        ,(u'España' , u'http://www.ideal.es/granada/rss/feeds/espana.xml' )
        ,(u'Mundo' , u'http://www.ideal.es/granada/rss/feeds/internacional.xml' )
        ,(u'Vivir' , u'http://www.ideal.es/granada/rss/feeds/vivir.xml' )
        ,(u'Opinión' , u'http://www.ideal.es/granada/rss/feeds/opinion.xml' )
        ,(u'Televisión' , u'http://www.ideal.es/granada/rss/feeds/television.xml' )
        ,(u'Contraportada' , u'http://www.ideal.es/granada/rss/feeds/contraportada.xml' )
    ]
|
||||
|
67
recipes/ideal_jaen.recipe
Normal file
67
recipes/ideal_jaen.recipe
Normal file
@ -0,0 +1,67 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'Josemi Liébana <office at josemi-liebana.com>'
|
||||
__copyright__ = 'Josemi Liébana'
|
||||
__version__ = 'v0.1'
|
||||
__date__ = '5 January 2012'
|
||||
|
||||
|
||||
'''
|
||||
www.ideal.es
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Ideal(BasicNewsRecipe):
    """Recipe for the Jaen edition of the newspaper Ideal (www.ideal.es).

    Purely declarative: the class only sets BasicNewsRecipe attributes
    (metadata, clean-up rules and the list of RSS feeds).
    """
    title = u'Ideal (Edición Jaén)'
    __author__ = u'Josemi Liébana'
    description = u'Noticias de Jaén y el resto del mundo'
    publisher = 'Ideal'
    category = u'News, Politics, Spain, Jaén'
    publication_type = 'Newspaper'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    language = 'es'
    remove_empty_feeds = True
    masthead_url = 'http://www.ideal.es/img/rd.logotipo2_ideal.gif'
    # NOTE(review): the cover is a fixed image under
    # /granada/noticias/201112/24/ -- it is neither Jaen-specific nor
    # updated per issue; confirm this is intended.
    cover_url = 'http://www.ideal.es/granada/noticias/201112/24/Media/Granada/portada--647x894.JPG'
    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .photo-caption{font-size: x-small} '

    # Conversion metadata reuses the class attributes defined above.
    conversion_options = {
        'comment' : description
        , 'tags' : category
        , 'publisher' : publisher
        , 'language' : language
    }

    # Article content is selected by element id / CSS class.
    keep_only_tags = [
        dict(attrs={'id':'title'})
        ,dict(attrs={'class':['overhead','headline','subhead','date','text','noticia_cont','desarrollo']})
    ]

    remove_tags = [dict(name='ul')]

    remove_attributes = ['width','height']

    # (section title, feed URL) pairs.
    # NOTE(review): the 'Local' feed ends in granada.xml although every
    # other URL here is under /jaen/ -- possibly a copy-paste leftover;
    # verify the feed name against the site.
    feeds = [
        (u'Última Hora' , u'http://www.ideal.es/jaen/rss/feeds/ultima.xml' )
        ,(u'Portada' , u'http://www.ideal.es/jaen/portada.xml' )
        ,(u'Local' , u'http://www.ideal.es/jaen/rss/feeds/granada.xml' )
        ,(u'Deportes' , u'http://www.ideal.es/jaen/rss/feeds/deportes.xml' )
        ,(u'Sociedad' , u'http://www.ideal.es/jaen/rss/feeds/sociedad.xml' )
        ,(u'Cultura' , u'http://www.ideal.es/jaen/rss/feeds/cultura.xml' )
        ,(u'Economía' , u'http://www.ideal.es/jaen/rss/feeds/economia.xml' )
        ,(u'Costa' , u'http://www.ideal.es/jaen/rss/feeds/costa.xml' )
        ,(u'Andalucía' , u'http://www.ideal.es/jaen/rss/feeds/andalucia.xml' )
        ,(u'España' , u'http://www.ideal.es/jaen/rss/feeds/espana.xml' )
        ,(u'Mundo' , u'http://www.ideal.es/jaen/rss/feeds/internacional.xml' )
        ,(u'Vivir' , u'http://www.ideal.es/jaen/rss/feeds/vivir.xml' )
        ,(u'Opinión' , u'http://www.ideal.es/jaen/rss/feeds/opinion.xml' )
        ,(u'Televisión' , u'http://www.ideal.es/jaen/rss/feeds/television.xml' )
        ,(u'Contraportada' , u'http://www.ideal.es/jaen/rss/feeds/contraportada.xml' )
    ]
|
||||
|
@ -1,63 +1,30 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Derry FitzGerald'
|
||||
'''
|
||||
iht.com
|
||||
'''
|
||||
import re
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
|
||||
class NYTimesGlobal(BasicNewsRecipe):
|
||||
title = u'NY Times Global'
|
||||
language = 'en'
|
||||
__author__ = 'Krittika Goyal'
|
||||
oldest_article = 1 #days
|
||||
max_articles_per_feed = 25
|
||||
use_embedded_content = False
|
||||
|
||||
class InternationalHeraldTribune(BasicNewsRecipe):
|
||||
title = u'The International Herald Tribune'
|
||||
__author__ = 'Derry FitzGerald'
|
||||
language = 'en'
|
||||
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 30
|
||||
no_stylesheets = True
|
||||
auto_cleanup = True
|
||||
|
||||
remove_tags = [dict(name='div', attrs={'class':['footer','header']}),
|
||||
dict(name=['form'])]
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<!-- webtrends.*', re.DOTALL),
|
||||
lambda m:'</body></html>')
|
||||
]
|
||||
extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
|
||||
|
||||
remove_empty_feeds = True
|
||||
|
||||
feeds = [
|
||||
(u'Frontpage', u'http://www.iht.com/rss/frontpage.xml'),
|
||||
(u'Business', u'http://www.iht.com/rss/business.xml'),
|
||||
(u'Americas', u'http://www.iht.com/rss/america.xml'),
|
||||
(u'Europe', u'http://www.iht.com/rss/europe.xml'),
|
||||
(u'Asia', u'http://www.iht.com/rss/asia.xml'),
|
||||
(u'Africa and Middle East', u'http://www.iht.com/rss/africa.xml'),
|
||||
(u'Opinion', u'http://www.iht.com/rss/opinion.xml'),
|
||||
(u'Technology', u'http://www.iht.com/rss/technology.xml'),
|
||||
(u'Health and Science', u'http://www.iht.com/rss/healthscience.xml'),
|
||||
(u'Sports', u'http://www.iht.com/rss/sports.xml'),
|
||||
(u'Culture', u'http://www.iht.com/rss/arts.xml'),
|
||||
(u'Style and Design', u'http://www.iht.com/rss/style.xml'),
|
||||
(u'Travel', u'http://www.iht.com/rss/travel.xml'),
|
||||
(u'At Home Abroad', u'http://www.iht.com/rss/athome.xml'),
|
||||
(u'Your Money', u'http://www.iht.com/rss/yourmoney.xml'),
|
||||
(u'Properties', u'http://www.iht.com/rss/properties.xml')
|
||||
]
|
||||
temp_files = []
|
||||
articles_are_obfuscated = True
|
||||
|
||||
masthead_url = 'http://graphics8.nytimes.com/images/misc/iht-masthead-logo.gif'
|
||||
|
||||
def get_obfuscated_article(self, url):
|
||||
br = self.get_browser()
|
||||
br.open(url)
|
||||
response1 = br.follow_link(url_regex=re.compile(r'.*pagewanted=print.*'))
|
||||
html = response1.read()
|
||||
|
||||
self.temp_files.append(PersistentTemporaryFile('_iht.html'))
|
||||
self.temp_files[-1].write(html)
|
||||
self.temp_files[-1].close()
|
||||
return self.temp_files[-1].name
|
||||
('NYTimes',
|
||||
'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml'),
|
||||
('NYTimes global',
|
||||
'http://www.nytimes.com/services/xml/rss/nyt/GlobalHome.xml'),
|
||||
('World',
|
||||
'http://www.nytimes.com/services/xml/rss/nyt/World.xml'),
|
||||
('U.S.',
|
||||
'http://www.nytimes.com/services/xml/rss/nyt/US.xml'),
|
||||
('Business',
|
||||
'http://feeds.nytimes.com/nyt/rss/Business'),
|
||||
('Sports',
|
||||
'http://www.nytimes.com/services/xml/rss/nyt/Sports.xml'),
|
||||
('Technology',
|
||||
'http://feeds.nytimes.com/nyt/rss/Technology'),
|
||||
]
|
||||
|
12
recipes/iktibas.recipe
Normal file
12
recipes/iktibas.recipe
Normal file
@ -0,0 +1,12 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class BasicUserRecipe1324739406(BasicNewsRecipe):
    """Recipe for 'Iktibas' (language 'tr'), built from a single RSS feed.

    Purely declarative: only BasicNewsRecipe attributes are set.
    """
    # The title starts with U+0130 (capital dotted I), written as an escape.
    title = u'\u0130ktibas'
    language = 'tr'
    __author__ = 'asalet_r'

    oldest_article = 7
    max_articles_per_feed = 20
    # No hand-written tag filters; article clean-up is left to calibre.
    auto_cleanup = True

    # (section title, feed URL) pairs.
    feeds = [(u'\u0130ktibas', u'http://www.iktibasdergisi.com/rss/rss.xml')]
|
@ -1,16 +1,20 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1234144423(BasicNewsRecipe):
|
||||
title = u'Indianapolis Star'
|
||||
oldest_article = 5
|
||||
language = 'en'
|
||||
class IndianapolisStar(BasicNewsRecipe):
|
||||
title = u'Indianapolis Star'
|
||||
oldest_article = 10
|
||||
auto_cleanup = True
|
||||
language = 'en'
|
||||
__author__ = 'Owen Kelly'
|
||||
max_articles_per_feed = 100
|
||||
cover_url = u'http://www2.indystar.com/frontpage/images/today.jpg'
|
||||
feeds = [(u'Community Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LOCAL&template=rss'),
|
||||
(u'News Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=NEWS&template=rss'),
|
||||
(u'Business Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=BUSINESS&template=rss'),
|
||||
(u'Politics and Government', u'http://www.indystar.com/apps/pbcs.dll/section?Category=NEWS05&template=rss'),
|
||||
(u'Lifestyle Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LIVING&template=rss&mime=XML'),
|
||||
(u'Opinion Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=OPINION&template=rss&mime=XML')
|
||||
]
|
||||
|
||||
__author__ = 'Owen Kelly'
|
||||
max_articles_per_feed = 100
|
||||
|
||||
cover_url = u'http://www2.indystar.com/frontpage/images/today.jpg'
|
||||
|
||||
feeds = [(u'Community Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LOCAL&template=rss&mime=XML'), (u'News Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=NEWS&template=rss&mime=XML'), (u'Business Headlines', u'http://www..indystar.com/apps/pbcs.dll/section?Category=BUSINESS&template=rss&mime=XML'), (u'Sports Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=SPORTS&template=rss&mime=XML'), (u'Lifestyle Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LIVING&template=rss&mime=XML'), (u'Opinion Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=OPINION&template=rss&mime=XML')]
|
||||
|
||||
def print_version(self, url):
|
||||
return url + '&template=printart'
|
||||
def print_version(self, url):
    """Return the printer-friendly address of an article.

    :param url: article URL as found in the feed
    :return: ``url`` with the print-template query parameter appended
    """
    print_suffix = '&template=printart'
    return url + print_suffix
|
||||
|
12
recipes/izdiham.com.recipe
Normal file
12
recipes/izdiham.com.recipe
Normal file
@ -0,0 +1,12 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class BasicUserRecipe1324158549(BasicNewsRecipe):
    """Recipe for izdiham.com (language 'tr'), built from a single RSS feed.

    Purely declarative: only BasicNewsRecipe attributes are set.
    """
    title = u'izdiham.com'
    language = 'tr'
    __author__ = 'asalet_r'

    oldest_article = 7
    max_articles_per_feed = 20
    # No hand-written tag filters; article clean-up is left to calibre.
    auto_cleanup = True

    # (section title, feed URL) pairs; the section title starts with
    # U+0130 (capital dotted I), written as an escape.
    feeds = [(u'\u0130zdiham', u'http://www.izdiham.com/index.php/feed')]
|
@ -1,79 +1,79 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Attis <attis@attis.one.pl>'
|
||||
__copyright__ = '2011 Attis <attis@attis.one.pl>, 2012 Tomasz Długosz <tomek3d@gmail.com>'
|
||||
__version__ = 'v. 0.1'
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class KopalniaWiedzy(BasicNewsRecipe):
|
||||
title = u'Kopalnia Wiedzy'
|
||||
publisher = u'Kopalnia Wiedzy'
|
||||
description = u'Ciekawostki ze świata nauki i techniki'
|
||||
encoding = 'utf-8'
|
||||
__author__ = 'Attis'
|
||||
language = 'pl'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
INDEX = u'http://kopalniawiedzy.pl/'
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
|
||||
remove_tags = [{'name':'p', 'attrs': {'class': 'keywords'} }, {'name':'div', 'attrs': {'class':'sexy-bookmarks sexy-bookmarks-bg-caring'}}]
|
||||
remove_tags_after = dict(attrs={'class':'ad-square'})
|
||||
keep_only_tags = [dict(name="div", attrs={'id':'articleContent'})]
|
||||
extra_css = '.topimage {margin-top: 30px}'
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'),
|
||||
lambda match: '<img class="topimage" ' + match.group(1) + '>' ),
|
||||
(re.compile(u'<br /><br />'),
|
||||
lambda match: '<br\/>')
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'),
|
||||
(u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'),
|
||||
(u'Psychologia', u'http://kopalniawiedzy.pl/wiadomosci_psychologia.rss'),
|
||||
(u'Technologie', u'http://kopalniawiedzy.pl/wiadomosci_technologie.rss'),
|
||||
(u'Ciekawostki', u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'),
|
||||
(u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss')
|
||||
]
|
||||
|
||||
def is_link_wanted(self, url, tag):
|
||||
return tag['class'] == 'next'
|
||||
|
||||
def remove_beyond(self, tag, next):
|
||||
while tag is not None and getattr(tag, 'name', None) != 'body':
|
||||
after = getattr(tag, next)
|
||||
while after is not None:
|
||||
ns = getattr(tag, next)
|
||||
after.extract()
|
||||
after = ns
|
||||
tag = tag.parent
|
||||
|
||||
def append_page(self, soup, appendtag, position):
|
||||
pager = soup.find('a',attrs={'class':'next'})
|
||||
if pager:
|
||||
nexturl = self.INDEX + pager['href']
|
||||
soup2 = self.index_to_soup(nexturl)
|
||||
texttag = soup2.find('div', attrs={'id':'articleContent'})
|
||||
|
||||
tag = texttag.find(attrs={'class':'pages'})
|
||||
self.remove_beyond(tag, 'nextSibling')
|
||||
|
||||
newpos = len(texttag.contents)
|
||||
self.append_page(soup2,texttag,newpos)
|
||||
title = u'Kopalnia Wiedzy'
|
||||
publisher = u'Kopalnia Wiedzy'
|
||||
description = u'Ciekawostki ze świata nauki i techniki'
|
||||
encoding = 'utf-8'
|
||||
__author__ = 'Attis & Tomasz Długosz'
|
||||
language = 'pl'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
INDEX = u'http://kopalniawiedzy.pl/'
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
|
||||
appendtag.insert(position,texttag)
|
||||
remove_tags = [{'name':'p', 'attrs': {'class': 'keywords'}}, {'name':'div', 'attrs': {'class':'sexy-bookmarks sexy-bookmarks-bg-caring'}}, {'name':'div', 'attrs': {'class':'article-time-and-cat'}}, {'name':'p', 'attrs': {'class':'tags'}}]
|
||||
remove_tags_after = dict(attrs={'class':'ad-square'})
|
||||
keep_only_tags = [dict(name="div", attrs={'class':'article-text text-small'})]
|
||||
extra_css = '.topimage {margin-top: 30px}'
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'),
|
||||
lambda match: '<img class="topimage" ' + match.group(1) + '>' ),
|
||||
(re.compile(u'<br /><br />'),
|
||||
lambda match: '<br\/>')
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'),
|
||||
(u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'),
|
||||
(u'Psychologia', u'http://kopalniawiedzy.pl/wiadomosci_psychologia.rss'),
|
||||
(u'Technologie', u'http://kopalniawiedzy.pl/wiadomosci_technologie.rss'),
|
||||
(u'Ciekawostki', u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'),
|
||||
(u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss')
|
||||
]
|
||||
|
||||
def is_link_wanted(self, url, tag):
    """Tell whether a link should be followed: only 'next page' links are.

    :param url: target of the link (unused)
    :param tag: the anchor element; must support ``.get(attr)``
    :return: True if the element's ``class`` attribute equals ``'next'``
    """
    # Use .get() so anchors without a class attribute yield False instead
    # of raising KeyError.
    return tag.get('class') == 'next'
|
||||
|
||||
def remove_beyond(self, tag, next):
    """Detach everything that follows ``tag`` in the given direction.

    Starting at ``tag`` and walking up through its ancestors (stopping at
    the element named ``body`` or at the top of the tree), every node
    reachable through the ``next`` attribute is extracted.

    :param tag: starting element, or None (in which case nothing happens)
    :param next: name of the sibling attribute to follow, e.g.
        ``'nextSibling'``
    """
    while tag is not None and getattr(tag, 'name', None) != 'body':
        after = getattr(tag, next)
        while after is not None:
            # Read the successor from the node being removed *before*
            # extracting it. Reading it from ``tag`` (as the code used to)
            # returned the same node again, so every node was extracted
            # twice and the loop depended on extract() being idempotent.
            following = getattr(after, next)
            after.extract()
            after = following
        tag = tag.parent
|
||||
|
||||
def append_page(self, soup, appendtag, position):
    """Recursively pull the following pages of a multi-page article in.

    If ``soup`` contains an ``<a class="next">`` link, the linked page is
    loaded, its ``div#articleContent`` is trimmed after the pager element
    (class ``pages``), any further pages are appended to it, and the
    result is inserted into ``appendtag`` at ``position``.

    :param soup: parsed page to look for the 'next' link in
    :param appendtag: element that receives the next page's content
    :param position: index in ``appendtag`` at which to insert it
    """
    pager = soup.find('a', attrs={'class': 'next'})
    if not pager:
        return
    nexturl = self.INDEX + pager['href']
    soup2 = self.index_to_soup(nexturl)
    texttag = soup2.find('div', attrs={'id': 'articleContent'})
    if texttag is None:
        # The next page has no recognisable content container: keep what
        # has been collected so far instead of failing on None.find().
        return

    tag = texttag.find(attrs={'class': 'pages'})
    self.remove_beyond(tag, 'nextSibling')

    newpos = len(texttag.contents)
    self.append_page(soup2, texttag, newpos)

    appendtag.insert(position, texttag)
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
self.append_page(soup, soup.body, 3)
|
||||
|
||||
for item in soup.findAll('div',attrs={'class':'pages'}):
|
||||
item.extract()
|
||||
|
||||
for item in soup.findAll('p', attrs={'class':'wykop'}):
|
||||
item.extract()
|
||||
|
||||
return soup
|
||||
def preprocess_html(self, soup):
    """Merge follow-up pages into the article and drop navigation clutter.

    :param soup: parsed article page; modified in place
    :return: the same ``soup`` object
    """
    self.append_page(soup, soup.body, 3)

    # Pager blocks first, then the 'wykop' paragraphs.
    unwanted = (('div', 'pages'), ('p', 'wykop'))
    for tag_name, css_class in unwanted:
        for node in soup.findAll(tag_name, attrs={'class': css_class}):
            node.extract()

    return soup
|
||||
|
@ -1,10 +1,9 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
www.la-razon.com
|
||||
'''
|
||||
|
||||
from calibre import strftime
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class LaRazon_Bol(BasicNewsRecipe):
|
||||
@ -16,19 +15,17 @@ class LaRazon_Bol(BasicNewsRecipe):
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 200
|
||||
no_stylesheets = True
|
||||
encoding = 'cp1252'
|
||||
encoding = 'utf8'
|
||||
use_embedded_content = False
|
||||
language = 'es_BO'
|
||||
publication_type = 'newspaper'
|
||||
delay = 1
|
||||
remove_empty_feeds = True
|
||||
cover_url = strftime('http://www.la-razon.com/portadas/%Y%m%d_LaRazon.jpg')
|
||||
masthead_url = 'http://www.la-razon.com/imagenes/logo.jpg'
|
||||
extra_css = """ body{font-family: Arial,Helvetica,sans-serif }
|
||||
img{margin-bottom: 0.4em}
|
||||
.noticia-titulo{font-family: Georgia,"Times New Roman",Times,serif}
|
||||
.lead{font-weight: bold; font-size: 0.8em}
|
||||
"""
|
||||
masthead_url = 'http://www.la-razon.com/static/LRZRazon/images/lrz-logo.png'
|
||||
extra_css = """ body{font-family: Georgia,"Times New Roman",Times,serif}
|
||||
img{margin-bottom: 0.4em; display: block}
|
||||
.meta{font-size: small; font-family: Arial,Helvetica,sans-serif}
|
||||
"""
|
||||
INDEX = 'http://www.la-razon.com/'
|
||||
|
||||
conversion_options = {
|
||||
'comment' : description
|
||||
@ -37,28 +34,37 @@ class LaRazon_Bol(BasicNewsRecipe):
|
||||
, 'language' : language
|
||||
}
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class':['noticia-titulo','noticia-desarrollo']})]
|
||||
remove_tags = [dict(name=['meta','link','form','iframe','embed','object'])]
|
||||
keep_only_tags = [dict(name='div', attrs={'class':['pg-hd', 'pg-bd']})]
|
||||
remove_tags = [
|
||||
dict(name=['meta','link','form','iframe','embed','object'])
|
||||
,dict(name='div', attrs={'class':'bd'})
|
||||
]
|
||||
remove_attributes = ['width','height']
|
||||
|
||||
feeds = [
|
||||
(u'Editorial' , u'http://www.la-razon.com/rss_editorial.php' )
|
||||
,(u'Opinión' , u'http://www.la-razon.com/rss_opinion.php' )
|
||||
,(u'Nacional' , u'http://www.la-razon.com/rss_nacional.php' )
|
||||
,(u'Economia' , u'http://www.la-razon.com/rss_economia.php' )
|
||||
,(u'Ciudades' , u'http://www.la-razon.com/rss_ciudades.php' )
|
||||
,(u'Sociedad' , u'http://www.la-razon.com/rss_sociedad.php' )
|
||||
,(u'Mundo' , u'http://www.la-razon.com/rss_sociedad.php' )
|
||||
,(u'La Revista' , u'http://www.la-razon.com/rss_larevista.php' )
|
||||
,(u'Sociales' , u'http://www.la-razon.com/rss_sociales.php' )
|
||||
,(u'Mia' , u'http://www.la-razon.com/rss_mia.php' )
|
||||
,(u'Marcas' , u'http://www.la-razon.com/rss_marcas.php' )
|
||||
,(u'Escape' , u'http://www.la-razon.com/rss_escape.php' )
|
||||
,(u'El Financiero' , u'http://www.la-razon.com/rss_financiero.php')
|
||||
,(u'Tendencias' , u'http://www.la-razon.com/rss_tendencias.php')
|
||||
(u'Editorial' , u'http://www.la-razon.com/rss/opinion/editorial/' )
|
||||
,(u'Nacional' , u'http://www.la-razon.com/rss/nacional/' )
|
||||
,(u'Economia' , u'http://www.la-razon.com/rss/economia/' )
|
||||
,(u'Ciudades' , u'http://www.la-razon.com/rss/ciudades/' )
|
||||
,(u'Sociedad' , u'http://www.la-razon.com/rss/sociedad/' )
|
||||
,(u'Mundo' , u'http://www.la-razon.com/rss/mundo/' )
|
||||
,(u'La Revista' , u'http://www.la-razon.com/rss/la_revista/' )
|
||||
,(u'Sociales' , u'http://www.la-razon.com/rss/sociales/' )
|
||||
,(u'Mia' , u'http://www.la-razon.com/rss/suplementos/mia/' )
|
||||
,(u'Marcas' , u'http://www.la-razon.com/rss/marcas/' )
|
||||
,(u'Escape' , u'http://www.la-razon.com/rss/suplementos/escape/' )
|
||||
,(u'El Financiero' , u'http://www.la-razon.com/rss/suplementos/financiero/')
|
||||
,(u'Tendencias' , u'http://www.la-razon.com/rss/suplementos/tendencias/')
|
||||
]
|
||||
|
||||
def preprocess_html(self, soup):
    """Strip every inline ``style`` attribute from the parsed page.

    The soup is modified in place and the same object is returned.
    """
    styled_tags = soup.findAll(style=True)
    for tag in styled_tags:
        del tag['style']
    return soup
|
||||
|
||||
def get_cover_url(self):
    """Return the URL of the front-page image, or None if it cannot be found.

    Fetches ``self.INDEX`` and looks for the front-page lightbox ``<div>``;
    the ``src`` of the first ``<img>`` inside it is used as the cover.
    """
    soup = self.index_to_soup(self.INDEX)
    lightbox = soup.find('div', attrs={'class': 'lightbox lightbox-frontpage'})
    # The lightbox (or its image) may be absent, e.g. after a site layout
    # change; report "no cover" instead of raising on a None dereference.
    if lightbox is None or lightbox.img is None:
        return None
    return lightbox.img['src']
|
||||
|
||||
|
||||
|
14
recipes/lega_nerd.recipe
Normal file
14
recipes/lega_nerd.recipe
Normal file
@ -0,0 +1,14 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1326135232(BasicNewsRecipe):
    # Recipe for Lega Nerd, read from a single FeedBurner feed.

    # Recipe bookkeeping
    __author__ = 'faber1971'
    __version__ = 'v1.0'
    __date__ = '9, January 2011'

    # Publication details
    title = u'Lega Nerd'
    description = 'nerd / geek culture, pc, comics, music, culture'
    language = 'it'

    # Fetch settings
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    feeds = [
        (u'Lega Nerd', u'http://feeds.feedburner.com/LegaNerd'),
    ]
|
@ -41,7 +41,7 @@ class LosTiempos_Bol(BasicNewsRecipe):
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'articulo'})]
|
||||
remove_tags = [
|
||||
dict(name=['meta','link','form','iframe','embed','object','hr'])
|
||||
,dict(attrs={'class':['caja_fonts sin_border_bot','pub']})
|
||||
,dict(attrs={'class':['caja_fonts sin_border_bot','pub','twitter-share-button']})
|
||||
]
|
||||
remove_attributes = ['width','height']
|
||||
|
||||
|
@ -14,8 +14,11 @@ class WeeklyLWN(BasicNewsRecipe):
|
||||
description = 'Weekly summary of what has happened in the free software world.'
|
||||
__author__ = 'Davide Cavalca'
|
||||
language = 'en'
|
||||
site_url = 'http://lwn.net'
|
||||
|
||||
cover_url = 'http://lwn.net/images/lcorner.png'
|
||||
extra_css = 'pre,code,samp,kbd,tt { font-size: 80% }\nblockquote {margin-left:0 }\n* { color: black }\n'
|
||||
|
||||
cover_url = site_url + '/images/lcorner.png'
|
||||
#masthead_url = 'http://lwn.net/images/lcorner.png'
|
||||
publication_type = 'magazine'
|
||||
|
||||
@ -43,11 +46,29 @@ class WeeklyLWN(BasicNewsRecipe):
|
||||
br.submit()
|
||||
return br
|
||||
|
||||
def print_version(self, url):
    """Map an article link to the URL of its printable rendering.

    The fragment (everything from ``#`` onwards) is dropped,
    ``self.site_url`` is prefixed when the link does not already start
    with it, and ``?format=printable`` is appended unless the link
    already ends with it.
    """
    printable_suffix = '?format=printable'

    # Drop the anchor, if any
    target = url.partition('#')[0]

    # Make site-relative links absolute
    if not target.startswith(self.site_url):
        target = self.site_url + target

    # Request the printable page, without doubling the parameter
    if not target.endswith(printable_suffix):
        target += printable_suffix

    return target
|
||||
|
||||
def parse_index(self):
|
||||
if self.username is not None and self.password is not None:
|
||||
index_url = 'http://lwn.net/current/bigpage?format=printable'
|
||||
index_url = self.print_version('/current/bigpage')
|
||||
else:
|
||||
index_url = 'http://lwn.net/free/bigpage?format=printable'
|
||||
index_url = self.print_version('/free/bigpage')
|
||||
soup = self.index_to_soup(index_url)
|
||||
body = soup.body
|
||||
|
||||
@ -56,19 +77,19 @@ class WeeklyLWN(BasicNewsRecipe):
|
||||
url_re = re.compile('^/Articles/')
|
||||
|
||||
while True:
|
||||
tag_title = body.findNext(name='p', attrs={'class':'SummaryHL'})
|
||||
tag_title = body.findNext(attrs={'class':'SummaryHL'})
|
||||
if tag_title == None:
|
||||
break
|
||||
|
||||
tag_section = tag_title.findPrevious(name='p', attrs={'class':'Cat1HL'})
|
||||
tag_section = tag_title.findPrevious(attrs={'class':'Cat1HL'})
|
||||
if tag_section == None:
|
||||
section = 'Front Page'
|
||||
else:
|
||||
section = tag_section.string
|
||||
|
||||
tag_section2 = tag_title.findPrevious(name='p', attrs={'class':'Cat2HL'})
|
||||
tag_section2 = tag_title.findPrevious(attrs={'class':'Cat2HL'})
|
||||
if tag_section2 != None:
|
||||
if tag_section2.findPrevious(name='p', attrs={'class':'Cat1HL'}) == tag_section:
|
||||
if tag_section2.findPrevious(attrs={'class':'Cat1HL'}) == tag_section:
|
||||
section = "%s: %s" %(section, tag_section2.string)
|
||||
|
||||
if section not in articles.keys():
|
||||
@ -94,9 +115,10 @@ class WeeklyLWN(BasicNewsRecipe):
|
||||
if tag_url == None:
|
||||
break
|
||||
|
||||
|
||||
article = dict(
|
||||
title=self.tag_to_string(tag_title),
|
||||
url= 'http://lwn.net' + tag_url['href'].split('#')[0] + '?format=printable',
|
||||
url=tag_url['href'],
|
||||
description='', content='', date='')
|
||||
articles[section].append(article)
|
||||
|
||||
|
23
recipes/macity.recipe
Normal file
23
recipes/macity.recipe
Normal file
@ -0,0 +1,23 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1325766771(BasicNewsRecipe):
    # Recipe for Macity, read from its feedsportal-hosted RSS feed.
    __author__ = 'faber1971'
    title = u'Macity'
    description = 'Apple and hi-tech news'
    language = 'it'
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    feeds = [(u'Macity', u'http://www.macitynet.it.feedsportal.com/c/33714/f/599513/index.rss')]

    # feedsportal escape codes: '0' followed by the key letter stands for
    # the text on the right.
    _feedsportal_codes = {
        'A': '0', 'B': '.', 'C': '/', 'D': '?', 'E': '-', 'F': '=',
        'G': '&', 'I': '_', 'L': 'http://', 'N': '.com', 'S': 'www.',
    }

    def get_article_url(self, article):
        """Return the article URL, decoding feedsportal redirect links.

        Links whose last path segment is ``story01.htm`` carry the real
        address, escaped, in the segment before it; that segment is decoded
        and returned. Any other link is returned unchanged.
        """
        import re  # local import: this recipe file does not import re at top level

        link = BasicNewsRecipe.get_article_url(self, article)
        if not link:
            # Nothing to decode; hand back whatever the base class gave us
            return link

        segments = link.split('/')
        if segments[-1] == "story01.htm":
            codes = self._feedsportal_codes
            # Decode in a single left-to-right pass. Chained str.replace
            # calls (with '0A' -> '0' applied first) would let a decoded
            # '0' combine with the following letter and be misread as
            # another escape, corrupting the URL.
            link = re.sub(
                r'0([A-Z])',
                lambda m: codes.get(m.group(1), m.group(0)),
                segments[-2])
        return link
|
||||
|
@ -10,6 +10,10 @@ __MakePeriodical__ = True
|
||||
__UseChineseTitle__ = False
|
||||
# Set it to False if you want to skip images (Default: True)
|
||||
__KeepImages__ = True
|
||||
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
|
||||
__IncludeSummary__ = False
|
||||
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
|
||||
__IncludeThumbnails__ = True
|
||||
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
|
||||
__UseLife__ = True
|
||||
# (HK only) It is to disable premium content (Default: False)
|
||||
@ -24,12 +28,15 @@ __Date__ = ''
|
||||
|
||||
'''
|
||||
Change Log:
|
||||
2011/12/18: update the overridden create_opf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
|
||||
from create_opf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
|
||||
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
|
||||
2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
|
||||
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
|
||||
2011/10/19: fix a bug in txt source parsing
|
||||
2011/10/17: disable fetching of premium content, also improved txt source parsing
|
||||
2011/10/04: option to get hi-res photos for the articles
|
||||
2011/09/21: fetching "column" section is made optional.
|
||||
2011/09/21: fetching "column" section is made optional.
|
||||
2011/09/18: parse "column" section stuff from source text file directly.
|
||||
2011/09/07: disable "column" section as it is no longer offered free.
|
||||
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
|
||||
@ -53,6 +60,7 @@ Change Log:
|
||||
2010/10/31: skip repeated articles in section pages
|
||||
'''
|
||||
|
||||
from calibre.utils.date import now as nowf
|
||||
import os, datetime, re, mechanize
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from contextlib import nested
|
||||
@ -60,11 +68,15 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.utils.localization import canonicalize_lang
|
||||
|
||||
# MAIN CLASS
|
||||
class MPRecipe(BasicNewsRecipe):
|
||||
if __Region__ == 'Hong Kong':
|
||||
title = 'Ming Pao - Hong Kong'
|
||||
if __UseChineseTitle__ == True:
|
||||
title = u'\u660e\u5831 (\u9999\u6e2f)'
|
||||
else:
|
||||
title = 'Ming Pao - Hong Kong'
|
||||
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
|
||||
category = 'Chinese, News, Hong Kong'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
|
||||
@ -109,7 +121,10 @@ class MPRecipe(BasicNewsRecipe):
|
||||
lambda match: "</b>")
|
||||
]
|
||||
elif __Region__ == 'Vancouver':
|
||||
title = 'Ming Pao - Vancouver'
|
||||
if __UseChineseTitle__ == True:
|
||||
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
|
||||
else:
|
||||
title = 'Ming Pao - Vancouver'
|
||||
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
|
||||
category = 'Chinese, News, Vancouver'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
|
||||
@ -127,7 +142,10 @@ class MPRecipe(BasicNewsRecipe):
|
||||
lambda match: ''),
|
||||
]
|
||||
elif __Region__ == 'Toronto':
|
||||
title = 'Ming Pao - Toronto'
|
||||
if __UseChineseTitle__ == True:
|
||||
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
|
||||
else:
|
||||
title = 'Ming Pao - Toronto'
|
||||
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
|
||||
category = 'Chinese, News, Toronto'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
|
||||
@ -161,9 +179,9 @@ class MPRecipe(BasicNewsRecipe):
|
||||
def get_dtlocal(self):
|
||||
dt_utc = datetime.datetime.utcnow()
|
||||
if __Region__ == 'Hong Kong':
|
||||
# convert UTC to local hk time - at HKT 5.30am, all news are available
|
||||
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
|
||||
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
|
||||
# convert UTC to local hk time - at HKT 4.30am, all news are available
|
||||
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
|
||||
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
|
||||
elif __Region__ == 'Vancouver':
|
||||
# convert UTC to local Vancouver time - at PST time 5.30am, all news are available
|
||||
dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
|
||||
@ -186,6 +204,18 @@ class MPRecipe(BasicNewsRecipe):
|
||||
else:
|
||||
return self.get_dtlocal().strftime("%Y-%m-%d")
|
||||
|
||||
def get_fetchyear(self):
    """Return the four-digit year of the issue being fetched, as a string.

    When the module-level ``__Date__`` override is set, its first four
    characters are used; otherwise the year comes from
    ``self.get_dtlocal()``.
    """
    # ``!=`` replaces the deprecated ``<>`` operator (a syntax error on Python 3)
    if __Date__ != '':
        return __Date__[0:4]
    return self.get_dtlocal().strftime("%Y")
|
||||
|
||||
def get_fetchmonth(self):
    """Return the two-digit month of the issue being fetched, as a string.

    When the module-level ``__Date__`` override is set, characters 4-5 of
    it are used; otherwise the month comes from ``self.get_dtlocal()``.
    """
    # ``!=`` replaces the deprecated ``<>`` operator (a syntax error on Python 3)
    if __Date__ != '':
        return __Date__[4:6]
    return self.get_dtlocal().strftime("%m")
|
||||
|
||||
def get_fetchday(self):
|
||||
if __Date__ <> '':
|
||||
return __Date__[6:8]
|
||||
@ -237,7 +267,7 @@ class MPRecipe(BasicNewsRecipe):
|
||||
articles = self.parse_section2_txt(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
|
||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
@ -274,7 +304,7 @@ class MPRecipe(BasicNewsRecipe):
|
||||
articles = self.parse_section2_txt(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
|
||||
#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
||||
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
|
||||
# articles = self.parse_section(url)
|
||||
@ -291,7 +321,7 @@ class MPRecipe(BasicNewsRecipe):
|
||||
articles = self.parse_section2_txt(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
|
||||
if __InclPremium__ == True:
|
||||
# parse column section articles directly from .txt files
|
||||
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
|
||||
@ -299,7 +329,7 @@ class MPRecipe(BasicNewsRecipe):
|
||||
articles = self.parse_section2_txt(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
|
||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
@ -379,7 +409,7 @@ class MPRecipe(BasicNewsRecipe):
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
|
||||
try:
|
||||
try:
|
||||
br.open_novisit(url)
|
||||
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
@ -406,7 +436,7 @@ class MPRecipe(BasicNewsRecipe):
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
|
||||
# parse from www.mingpaovan.com
|
||||
def parse_section3(self, url, baseUrl):
|
||||
self.get_fetchdate()
|
||||
@ -528,7 +558,7 @@ class MPRecipe(BasicNewsRecipe):
|
||||
photo = photo.replace('class="photo"', '')
|
||||
new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
|
||||
new_html = new_raw_html + '</body></html>'
|
||||
else:
|
||||
else:
|
||||
# .txt based file
|
||||
splitter = re.compile(r'\n') # Match non-digits
|
||||
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
|
||||
@ -591,23 +621,23 @@ class MPRecipe(BasicNewsRecipe):
|
||||
#raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
|
||||
if __HiResImg__ == True:
|
||||
# TODO: add a _ in front of an image url
|
||||
if url.rfind('news.mingpao.com') > -1:
|
||||
if url.rfind('news.mingpao.com') > -1:
|
||||
imglist = re.findall('src="?.*?jpg"', new_html)
|
||||
br = mechanize.Browser()
|
||||
br.set_handle_redirect(False)
|
||||
for img in imglist:
|
||||
gifimg = img.replace('jpg"', 'gif"')
|
||||
try:
|
||||
try:
|
||||
br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
|
||||
new_html = new_html.replace(img, gifimg)
|
||||
except:
|
||||
except:
|
||||
# find the location of the first _
|
||||
pos = img.find('_')
|
||||
if pos > -1:
|
||||
# if found, insert _ after the first _
|
||||
newimg = img[0:pos] + '_' + img[pos:]
|
||||
new_html = new_html.replace(img, newimg)
|
||||
else:
|
||||
else:
|
||||
# if not found, insert _ after "
|
||||
new_html = new_html.replace(img[1:], '"_' + img[1:])
|
||||
elif url.rfind('life.mingpao.com') > -1:
|
||||
@ -644,7 +674,7 @@ class MPRecipe(BasicNewsRecipe):
|
||||
#print 'Use hi-res img', newimg
|
||||
new_html = new_html.replace(img, newimg)
|
||||
return new_html
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
@ -653,78 +683,154 @@ class MPRecipe(BasicNewsRecipe):
|
||||
for item in soup.findAll(stype=True):
|
||||
del item['absmiddle']
|
||||
return soup
|
||||
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
# thumbnails shouldn't be available if using hi-res images
|
||||
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
|
||||
img = soup.find('img')
|
||||
if img is not None:
|
||||
self.add_toc_thumbnail(article, img['src'])
|
||||
|
||||
try:
|
||||
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
|
||||
# look for content
|
||||
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
|
||||
if not articlebodies:
|
||||
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
|
||||
if not articlebodies:
|
||||
articlebodies = soup.findAll('div',attrs={'class':'content'})
|
||||
if not articlebodies:
|
||||
articlebodies = soup.findAll('div', attrs={'id':'font'})
|
||||
if articlebodies:
|
||||
for articlebody in articlebodies:
|
||||
if articlebody:
|
||||
# the text may or may not be enclosed in <p></p> tag
|
||||
paras = articlebody.findAll('p')
|
||||
if not paras:
|
||||
paras = articlebody
|
||||
textFound = False
|
||||
for p in paras:
|
||||
if not textFound:
|
||||
summary_candidate = self.tag_to_string(p).strip()
|
||||
summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
|
||||
if len(summary_candidate) > 0:
|
||||
article.summary = article.text_summary = summary_candidate
|
||||
textFound = True
|
||||
else:
|
||||
# display a simple text
|
||||
#article.summary = article.text_summary = u'\u66f4\u591a......'
|
||||
# display word counts
|
||||
counts = 0
|
||||
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
|
||||
if not articlebodies:
|
||||
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
|
||||
if not articlebodies:
|
||||
articlebodies = soup.findAll('div',attrs={'class':'content'})
|
||||
if not articlebodies:
|
||||
articlebodies = soup.findAll('div', attrs={'id':'font'})
|
||||
if articlebodies:
|
||||
for articlebody in articlebodies:
|
||||
# the text may or may not be enclosed in <p></p> tag
|
||||
paras = articlebody.findAll('p')
|
||||
if not paras:
|
||||
paras = articlebody
|
||||
for p in paras:
|
||||
summary_candidate = self.tag_to_string(p).strip()
|
||||
counts += len(summary_candidate)
|
||||
article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
|
||||
except:
|
||||
self.log("Error creating article descriptions")
|
||||
return
|
||||
|
||||
# override from the one in version 0.8.31
|
||||
def create_opf(self, feeds, dir=None):
|
||||
if dir is None:
|
||||
dir = self.output_dir
|
||||
if __UseChineseTitle__ == True:
|
||||
if __Region__ == 'Hong Kong':
|
||||
title = u'\u660e\u5831 (\u9999\u6e2f)'
|
||||
elif __Region__ == 'Vancouver':
|
||||
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
|
||||
elif __Region__ == 'Toronto':
|
||||
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
|
||||
else:
|
||||
title = self.short_title()
|
||||
# if not generating a periodical, force date to apply in title
|
||||
if __MakePeriodical__ == False:
|
||||
title = self.short_title()
|
||||
# change 1: allow our own flag to tell if a periodical is to be generated
|
||||
# also use customed date instead of current time
|
||||
if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
|
||||
title = title + ' ' + self.get_fetchformatteddate()
|
||||
if True:
|
||||
mi = MetaInformation(title, [self.publisher])
|
||||
mi.publisher = self.publisher
|
||||
mi.author_sort = self.publisher
|
||||
if __MakePeriodical__ == True:
|
||||
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
|
||||
else:
|
||||
mi.publication_type = self.publication_type+':'+self.short_title()
|
||||
#mi.timestamp = nowf()
|
||||
mi.timestamp = self.get_dtlocal()
|
||||
mi.comments = self.description
|
||||
if not isinstance(mi.comments, unicode):
|
||||
mi.comments = mi.comments.decode('utf-8', 'replace')
|
||||
#mi.pubdate = nowf()
|
||||
mi.pubdate = self.get_dtlocal()
|
||||
opf_path = os.path.join(dir, 'index.opf')
|
||||
ncx_path = os.path.join(dir, 'index.ncx')
|
||||
opf = OPFCreator(dir, mi)
|
||||
# Add mastheadImage entry to <guide> section
|
||||
mp = getattr(self, 'masthead_path', None)
|
||||
if mp is not None and os.access(mp, os.R_OK):
|
||||
from calibre.ebooks.metadata.opf2 import Guide
|
||||
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
|
||||
ref.type = 'masthead'
|
||||
ref.title = 'Masthead Image'
|
||||
opf.guide.append(ref)
|
||||
# end of change 1
|
||||
# change 2: __appname__ replaced by newspaper publisher
|
||||
__appname__ = self.publisher
|
||||
mi = MetaInformation(title, [__appname__])
|
||||
mi.publisher = __appname__
|
||||
mi.author_sort = __appname__
|
||||
# change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
|
||||
if __MakePeriodical__ == True:
|
||||
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
|
||||
else:
|
||||
mi.publication_type = self.publication_type+':'+self.short_title()
|
||||
#mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
|
||||
# change 4: in the following, all the nowf() are changed to adjusted time
|
||||
# This one doesn't matter
|
||||
mi.timestamp = nowf()
|
||||
# change 5: skip listing the articles
|
||||
#article_titles, aseen = [], set()
|
||||
#for f in feeds:
|
||||
# for a in f:
|
||||
# if a.title and a.title not in aseen:
|
||||
# aseen.add(a.title)
|
||||
# article_titles.append(force_unicode(a.title, 'utf-8'))
|
||||
|
||||
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
|
||||
manifest.append(os.path.join(dir, 'index.html'))
|
||||
manifest.append(os.path.join(dir, 'index.ncx'))
|
||||
#mi.comments = self.description
|
||||
#if not isinstance(mi.comments, unicode):
|
||||
# mi.comments = mi.comments.decode('utf-8', 'replace')
|
||||
#mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
|
||||
# '\n\n'.join(article_titles))
|
||||
|
||||
# Get cover
|
||||
cpath = getattr(self, 'cover_path', None)
|
||||
if cpath is None:
|
||||
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
|
||||
if self.default_cover(pf):
|
||||
cpath = pf.name
|
||||
if cpath is not None and os.access(cpath, os.R_OK):
|
||||
opf.cover = cpath
|
||||
manifest.append(cpath)
|
||||
language = canonicalize_lang(self.language)
|
||||
if language is not None:
|
||||
mi.language = language
|
||||
# This one affects the pub date shown in kindle title
|
||||
#mi.pubdate = nowf()
|
||||
# now appears to need the time field to be > 12.00noon as well
|
||||
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
|
||||
opf_path = os.path.join(dir, 'index.opf')
|
||||
ncx_path = os.path.join(dir, 'index.ncx')
|
||||
|
||||
# Get masthead
|
||||
mpath = getattr(self, 'masthead_path', None)
|
||||
if mpath is not None and os.access(mpath, os.R_OK):
|
||||
manifest.append(mpath)
|
||||
opf = OPFCreator(dir, mi)
|
||||
# Add mastheadImage entry to <guide> section
|
||||
mp = getattr(self, 'masthead_path', None)
|
||||
if mp is not None and os.access(mp, os.R_OK):
|
||||
from calibre.ebooks.metadata.opf2 import Guide
|
||||
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
|
||||
ref.type = 'masthead'
|
||||
ref.title = 'Masthead Image'
|
||||
opf.guide.append(ref)
|
||||
|
||||
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
|
||||
manifest.append(os.path.join(dir, 'index.html'))
|
||||
manifest.append(os.path.join(dir, 'index.ncx'))
|
||||
|
||||
# Get cover
|
||||
cpath = getattr(self, 'cover_path', None)
|
||||
if cpath is None:
|
||||
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
|
||||
if self.default_cover(pf):
|
||||
cpath = pf.name
|
||||
if cpath is not None and os.access(cpath, os.R_OK):
|
||||
opf.cover = cpath
|
||||
manifest.append(cpath)
|
||||
|
||||
# Get masthead
|
||||
mpath = getattr(self, 'masthead_path', None)
|
||||
if mpath is not None and os.access(mpath, os.R_OK):
|
||||
manifest.append(mpath)
|
||||
|
||||
opf.create_manifest_from_files_in(manifest)
|
||||
for mani in opf.manifest:
|
||||
if mani.path.endswith('.ncx'):
|
||||
mani.id = 'ncx'
|
||||
if mani.path.endswith('mastheadImage.jpg'):
|
||||
mani.id = 'masthead-image'
|
||||
|
||||
entries = ['index.html']
|
||||
toc = TOC(base_path=dir)
|
||||
self.play_order_counter = 0
|
||||
self.play_order_map = {}
|
||||
|
||||
opf.create_manifest_from_files_in(manifest)
|
||||
for mani in opf.manifest:
|
||||
if mani.path.endswith('.ncx'):
|
||||
mani.id = 'ncx'
|
||||
if mani.path.endswith('mastheadImage.jpg'):
|
||||
mani.id = 'masthead-image'
|
||||
entries = ['index.html']
|
||||
toc = TOC(base_path=dir)
|
||||
self.play_order_counter = 0
|
||||
self.play_order_map = {}
|
||||
|
||||
def feed_index(num, parent):
|
||||
f = feeds[num]
|
||||
@ -739,13 +845,16 @@ class MPRecipe(BasicNewsRecipe):
|
||||
desc = None
|
||||
else:
|
||||
desc = self.description_limiter(desc)
|
||||
tt = a.toc_thumbnail if a.toc_thumbnail else None
|
||||
entries.append('%sindex.html'%adir)
|
||||
po = self.play_order_map.get(entries[-1], None)
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
|
||||
play_order=po, author=auth, description=desc)
|
||||
parent.add_item('%sindex.html'%adir, None,
|
||||
a.title if a.title else _('Untitled Article'),
|
||||
play_order=po, author=auth,
|
||||
description=desc, toc_thumbnail=tt)
|
||||
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
|
||||
for sp in a.sub_pages:
|
||||
prefix = os.path.commonprefix([opf_path, sp])
|
||||
@ -762,7 +871,7 @@ class MPRecipe(BasicNewsRecipe):
|
||||
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
|
||||
templ = self.navbar.generate(True, num, j, len(f),
|
||||
not self.has_single_feed,
|
||||
a.orig_url, self.publisher, prefix=prefix,
|
||||
a.orig_url, __appname__, prefix=prefix,
|
||||
center=self.center_navbar)
|
||||
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
|
||||
body.insert(len(body.contents), elem)
|
||||
@ -785,7 +894,7 @@ class MPRecipe(BasicNewsRecipe):
|
||||
if not desc:
|
||||
desc = None
|
||||
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
|
||||
f.title, play_order=po, description=desc, author=auth))
|
||||
f.title, play_order=po, description=desc, author=auth))
|
||||
|
||||
else:
|
||||
entries.append('feed_%d/index.html'%0)
|
||||
@ -799,3 +908,4 @@ class MPRecipe(BasicNewsRecipe):
|
||||
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
|
||||
opf.render(opf_file, ncx_file)
|
||||
|
||||
|
||||
|
@ -4,18 +4,41 @@ __copyright__ = '2010-2011, Eddie Lau'
|
||||
# Region - Hong Kong, Vancouver, Toronto
|
||||
__Region__ = 'Toronto'
|
||||
# Users of Kindle 3 with limited system-level CJK support
|
||||
# please replace the following "True" with "False".
|
||||
# please replace the following "True" with "False". (Default: True)
|
||||
__MakePeriodical__ = True
|
||||
# Turn below to true if your device supports display of CJK titles
|
||||
# Turn below to True if your device supports display of CJK titles (Default: False)
|
||||
__UseChineseTitle__ = False
|
||||
# Set it to False if you want to skip images
|
||||
# Set it to False if you want to skip images (Default: True)
|
||||
__KeepImages__ = True
|
||||
# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
|
||||
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
|
||||
__IncludeSummary__ = False
|
||||
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
|
||||
__IncludeThumbnails__ = True
|
||||
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
|
||||
__UseLife__ = True
|
||||
# (HK only) It is to disable premium content (Default: False)
|
||||
__InclPremium__ = False
|
||||
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
|
||||
__ParsePFF__ = True
|
||||
# (HK only) Turn below to True if you wish hi-res images (Default: False)
|
||||
__HiResImg__ = False
|
||||
# Override the date returned by the program if specifying a YYYYMMDD below
|
||||
__Date__ = ''
|
||||
|
||||
|
||||
'''
|
||||
Change Log:
|
||||
2011/12/18: update the overridden create_opf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
|
||||
from create_opf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
|
||||
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
|
||||
2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
|
||||
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
|
||||
2011/10/19: fix a bug in txt source parsing
|
||||
2011/10/17: disable fetching of premium content, also improved txt source parsing
|
||||
2011/10/04: option to get hi-res photos for the articles
|
||||
2011/09/21: fetching "column" section is made optional.
|
||||
2011/09/18: parse "column" section stuff from source text file directly.
|
||||
2011/09/07: disable "column" section as it is no longer offered free.
|
||||
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
|
||||
provide options to remove all images in the file
|
||||
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
|
||||
@ -37,30 +60,38 @@ Change Log:
|
||||
2010/10/31: skip repeated articles in section pages
|
||||
'''
|
||||
|
||||
import os, datetime, re
|
||||
from calibre.utils.date import now as nowf
|
||||
import os, datetime, re, mechanize
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from contextlib import nested
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.utils.localization import canonicalize_lang
|
||||
|
||||
# MAIN CLASS
|
||||
class MPRecipe(BasicNewsRecipe):
|
||||
if __Region__ == 'Hong Kong':
|
||||
title = 'Ming Pao - Hong Kong'
|
||||
if __UseChineseTitle__ == True:
|
||||
title = u'\u660e\u5831 (\u9999\u6e2f)'
|
||||
else:
|
||||
title = 'Ming Pao - Hong Kong'
|
||||
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
|
||||
category = 'Chinese, News, Hong Kong'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
|
||||
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
|
||||
keep_only_tags = [dict(name='h1'),
|
||||
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
|
||||
dict(name='font', attrs={'color':['AA0000']}), # for column articles title
|
||||
dict(attrs={'class':['heading']}), # for heading from txt
|
||||
dict(attrs={'id':['newscontent']}), # entertainment and column page content
|
||||
dict(attrs={'id':['newscontent01','newscontent02']}),
|
||||
dict(attrs={'class':['content']}), # for content from txt
|
||||
dict(attrs={'class':['photo']}),
|
||||
dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
|
||||
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
|
||||
dict(name='img', attrs={'width':['180'], 'alt':['????']}), # images for source from life.mingpao.com
|
||||
dict(attrs={'class':['images']}) # for images from txt
|
||||
]
|
||||
if __KeepImages__:
|
||||
remove_tags = [dict(name='style'),
|
||||
@ -90,7 +121,10 @@ class MPRecipe(BasicNewsRecipe):
|
||||
lambda match: "</b>")
|
||||
]
|
||||
elif __Region__ == 'Vancouver':
|
||||
title = 'Ming Pao - Vancouver'
|
||||
if __UseChineseTitle__ == True:
|
||||
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
|
||||
else:
|
||||
title = 'Ming Pao - Vancouver'
|
||||
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
|
||||
category = 'Chinese, News, Vancouver'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
|
||||
@ -108,7 +142,10 @@ class MPRecipe(BasicNewsRecipe):
|
||||
lambda match: ''),
|
||||
]
|
||||
elif __Region__ == 'Toronto':
|
||||
title = 'Ming Pao - Toronto'
|
||||
if __UseChineseTitle__ == True:
|
||||
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
|
||||
else:
|
||||
title = 'Ming Pao - Toronto'
|
||||
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
|
||||
category = 'Chinese, News, Toronto'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
|
||||
@ -139,49 +176,12 @@ class MPRecipe(BasicNewsRecipe):
|
||||
conversion_options = {'linearize_tables':True}
|
||||
timefmt = ''
|
||||
|
||||
def image_url_processor(cls, baseurl, url):
    """Return *url* unchanged.

    An earlier experiment tried to rewrite the URL here by inserting an
    extra ``_`` before its first digit. It was left commented out and marked
    as not working, with a note that such rewriting may need to move to
    preprocess_html(). The dead code has been dropped; this is a plain
    pass-through.

    *baseurl* is accepted but not used.
    """
    return url
|
||||
|
||||
def get_dtlocal(self):
|
||||
dt_utc = datetime.datetime.utcnow()
|
||||
if __Region__ == 'Hong Kong':
|
||||
# convert UTC to local hk time - at HKT 5.30am, all news are available
|
||||
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
|
||||
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
|
||||
# convert UTC to local hk time - at HKT 4.30am, all news are available
|
||||
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
|
||||
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
|
||||
elif __Region__ == 'Vancouver':
|
||||
# convert UTC to local Vancouver time - at PST time 5.30am, all news are available
|
||||
dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
|
||||
@ -193,13 +193,34 @@ class MPRecipe(BasicNewsRecipe):
|
||||
return dt_local
|
||||
|
||||
def get_fetchdate(self):
    """Return the issue date to fetch as a ``YYYYMMDD`` string.

    A non-empty module-level ``__Date__`` is returned as is; otherwise the
    value of ``self.get_dtlocal()`` is formatted with ``%Y%m%d``.
    """
    # The override has to be consulted first: returning the computed date
    # unconditionally would leave the __Date__ setting without any effect.
    if __Date__ != '':
        return __Date__
    return self.get_dtlocal().strftime("%Y%m%d")
|
||||
|
||||
def get_fetchformatteddate(self):
    """Return the issue date to fetch as a ``YYYY-MM-DD`` string.

    A non-empty module-level ``__Date__`` (written as YYYYMMDD) is sliced
    into year, month and day and re-joined with dashes; otherwise the value
    of ``self.get_dtlocal()`` is formatted with ``%Y-%m-%d``.
    """
    # Check the override before falling back to the computed date, so that
    # the __Date__ setting is actually honoured.
    if __Date__ != '':
        return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8]
    return self.get_dtlocal().strftime("%Y-%m-%d")
|
||||
|
||||
def get_fetchyear(self):
    """Return the four-digit year of the issue to fetch, as a string.

    Taken from the first four characters of a non-empty module-level
    ``__Date__``; otherwise from ``self.get_dtlocal()`` formatted with ``%Y``.
    """
    # `!=` replaces the `<>` spelling, which Python 3 no longer accepts.
    if __Date__ != '':
        return __Date__[0:4]
    return self.get_dtlocal().strftime("%Y")
|
||||
|
||||
def get_fetchmonth(self):
    """Return the two-digit month of the issue to fetch, as a string.

    Taken from characters 4-5 of a non-empty module-level ``__Date__``;
    otherwise from ``self.get_dtlocal()`` formatted with ``%m``.
    """
    # `!=` replaces the `<>` spelling, which Python 3 no longer accepts.
    if __Date__ != '':
        return __Date__[4:6]
    return self.get_dtlocal().strftime("%m")
|
||||
|
||||
def get_fetchday(self):
    """Return the two-digit day of the issue to fetch, as a string.

    Taken from characters 6-7 of a non-empty module-level ``__Date__``;
    otherwise from ``self.get_dtlocal()`` formatted with ``%d``.
    """
    # Check the override before falling back to the computed date, so that
    # the __Date__ setting is actually honoured.
    if __Date__ != '':
        return __Date__[6:8]
    return self.get_dtlocal().strftime("%d")
|
||||
|
||||
def get_cover_url(self):
|
||||
if __Region__ == 'Hong Kong':
|
||||
@ -230,12 +251,23 @@ class MPRecipe(BasicNewsRecipe):
|
||||
(u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
|
||||
(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
|
||||
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
|
||||
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
|
||||
(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
|
||||
articles = self.parse_section2(url, keystr)
|
||||
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
|
||||
]:
|
||||
if __InclPremium__ == True:
|
||||
articles = self.parse_section2_txt(url, keystr)
|
||||
else:
|
||||
articles = self.parse_section2(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
if __InclPremium__ == True:
|
||||
# parse column section articles directly from .txt files
|
||||
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
|
||||
]:
|
||||
articles = self.parse_section2_txt(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
@ -244,15 +276,16 @@ class MPRecipe(BasicNewsRecipe):
|
||||
else:
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
|
||||
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
|
||||
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
|
||||
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
|
||||
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
# special- editorial
|
||||
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
|
||||
if ed_articles:
|
||||
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
|
||||
#ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
|
||||
#if ed_articles:
|
||||
# feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
|
||||
|
||||
for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
|
||||
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
|
||||
@ -263,20 +296,39 @@ class MPRecipe(BasicNewsRecipe):
|
||||
|
||||
# special - finance
|
||||
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
|
||||
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
|
||||
if fin_articles:
|
||||
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
|
||||
#fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
|
||||
#if fin_articles:
|
||||
# feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
|
||||
|
||||
for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
||||
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
|
||||
articles = self.parse_section2_txt(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
||||
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
|
||||
# articles = self.parse_section(url)
|
||||
# if articles:
|
||||
# feeds.append((title, articles))
|
||||
|
||||
# special - entertainment
|
||||
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
|
||||
if ent_articles:
|
||||
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
|
||||
#ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
|
||||
#if ent_articles:
|
||||
# feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
|
||||
|
||||
for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
|
||||
]:
|
||||
articles = self.parse_section2_txt(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
if __InclPremium__ == True:
|
||||
# parse column section articles directly from .txt files
|
||||
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
|
||||
]:
|
||||
articles = self.parse_section2_txt(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
@ -284,11 +336,6 @@ class MPRecipe(BasicNewsRecipe):
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
|
||||
# special- columns
|
||||
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
|
||||
if col_articles:
|
||||
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
|
||||
elif __Region__ == 'Vancouver':
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
|
||||
(u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
|
||||
@ -332,6 +379,16 @@ class MPRecipe(BasicNewsRecipe):
|
||||
title = self.tag_to_string(a)
|
||||
url = a.get('href', False)
|
||||
url = 'http://news.mingpao.com/' + dateStr + '/' +url
|
||||
# replace the url to the print-friendly version
|
||||
if __ParsePFF__ == True:
|
||||
if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
|
||||
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
|
||||
url = re.sub('%2F.*%2F', '/', url)
|
||||
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
|
||||
url = url.replace('%2Etxt', '_print.htm')
|
||||
url = url.replace('%5F', '_')
|
||||
else:
|
||||
url = url.replace('.htm', '_print.htm')
|
||||
if url not in included_urls and url.rfind('Redirect') == -1:
|
||||
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
|
||||
included_urls.append(url)
|
||||
@ -340,6 +397,8 @@ class MPRecipe(BasicNewsRecipe):
|
||||
|
||||
# parse from life.mingpao.com
|
||||
def parse_section2(self, url, keystr):
|
||||
br = mechanize.Browser()
|
||||
br.set_handle_redirect(False)
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
@ -350,7 +409,29 @@ class MPRecipe(BasicNewsRecipe):
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
|
||||
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
|
||||
try:
|
||||
br.open_novisit(url)
|
||||
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
except:
|
||||
print 'skipping a premium article'
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
# parse from text file of life.mingpao.com
|
||||
def parse_section2_txt(self, url, keystr):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
a.reverse()
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
|
||||
url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') # use printed version of the article
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
@ -438,6 +519,162 @@ class MPRecipe(BasicNewsRecipe):
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
# preprocess those .txt and javascript based files
|
||||
def preprocess_raw_html(self, raw_html, url):
    """Turn ``.txt`` and javascript-based article sources into plain HTML.

    raw_html -- the downloaded text of the article.
    url      -- the address it came from; it selects the conversion:

    * URLs containing ``_print.htm`` hold the article in javascript
      variables (``heading1``, ``heading2``, ``content``, ``photocontent``),
      which are unpacked into ``<div>`` elements.
    * other URLs containing ``ftp`` are treated as line-oriented text:
      ``=`` lines name images, the line after an image is its caption, the
      first remaining text is the heading, and everything after the first
      blank line that follows the heading is content.
    * anything else is passed through untouched.

    When the module-level ``__HiResImg__`` is set, image references for
    news.mingpao.com and life.mingpao.com are then rewritten: each ``.jpg``
    is probed as ``.gif`` and, if that request fails, an underscore is
    inserted into the image name instead.

    Returns the resulting HTML string.
    """
    def js_value(line, prefix):
        # Strip the assignment prefix, then every quote and semicolon.
        return line.replace(prefix, '').replace('\'', '').replace(';', '')

    new_html = raw_html
    is_print_page = url.rfind('_print.htm') != -1
    if url.rfind('ftp') != -1 or is_print_page:
        if is_print_page:
            # javascript based file
            parts = ['<html><head><title>Untitled</title></head>', '<body>']
            for item in raw_html.split('\n'):
                if item.startswith('var heading1 ='):
                    # the heading <div> is closed when heading2 is seen
                    parts.append('<div class="heading">' + js_value(item, 'var heading1 = \''))
                elif item.startswith('var heading2 ='):
                    heading = js_value(item, 'var heading2 = \'')
                    if heading != '':
                        parts.append('<br>' + heading + '</div>')
                    else:
                        parts.append('</div>')
                elif item.startswith('var content ='):
                    parts.append('<div class="content">' + js_value(item, "var content = ") + '</div>')
                elif item.startswith('var photocontent ='):
                    photo = js_value(item, 'var photocontent = \'')
                    # flatten the photo table: one image per line, no cells
                    for old, new in (('<tr>', ''), ('<td>', ''), ('</tr>', ''),
                                     ('</td>', '<br>'), ('class="photo"', '')):
                        photo = photo.replace(old, new)
                    parts.append('<div class="images">' + photo + '</div>')
            new_html = ''.join(parts) + '</body></html>'
        else:
            # .txt based file, processed line by line
            parts = ['<html><head><title>Untitled</title></head><body><div class="images">']
            next_is_img_txt = False
            title_started = False
            title_break_reached = False
            met_article_start_char = False
            for item in raw_html.split('\n'):
                item = item.strip()
                if title_started and not title_break_reached and item == '':
                    # blank line after the title: the content follows
                    title_break_reached = True
                elif title_started and title_break_reached and not met_article_start_char:
                    # first non-empty line after the break opens the content
                    if item != '':
                        met_article_start_char = True
                        parts.append('</div><div class="content"><p>' + item + '<p>\n')
                elif next_is_img_txt:
                    # the line right after an image marker is its caption
                    next_is_img_txt = False
                    parts.append(item + '\n')
                elif item.startswith("=@"):
                    print('skip movie link')
                elif item.startswith("=?"):
                    next_is_img_txt = True
                    parts.append('<img src="' + str(item)[2:].strip() + '.gif" /><p>\n')
                elif item.startswith('=='):
                    next_is_img_txt = True
                    parts.append('<img src="' + str(item)[2:].strip() + '.jpg" /><p>\n')
                elif item.startswith('='):
                    next_is_img_txt = True
                    parts.append('<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n')
                elif not met_article_start_char:
                    if item != '':
                        if not title_started:
                            # leave the images <div> and open the heading
                            parts.append('</div><div class="heading">' + item + '\n')
                            title_started = True
                        else:
                            parts.append(item + '\n')
                else:
                    parts.append(item + '<p>\n')
            new_html = ''.join(parts) + '</div></body></html>'

    if __HiResImg__:
        if url.rfind('news.mingpao.com') > -1:
            imglist = re.findall('src="?.*?jpg"', new_html)
            br = mechanize.Browser()
            br.set_handle_redirect(False)
            for img in imglist:
                gifimg = img.replace('jpg"', 'gif"')
                try:
                    # [5:-1] drops the leading 'src="' and the closing quote
                    br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
                except Exception:
                    # no .gif reachable: fall back to the '_' image name
                    pos = img.find('_')
                    if pos > -1:
                        # if found, insert _ after the first _
                        newimg = img[0:pos] + '_' + img[pos:]
                    else:
                        # if not found, insert _ after the opening quote.
                        # (Replacing img[1:] with '"_' + img[1:] used to
                        # yield 's"_rc="..."', which is not valid markup.)
                        newimg = img[0:5] + '_' + img[5:]
                    new_html = new_html.replace(img, newimg)
                else:
                    new_html = new_html.replace(img, gifimg)
        elif url.rfind('life.mingpao.com') > -1:
            imglist = re.findall('src=\'?.*?jpg\'', new_html)
            br = mechanize.Browser()
            br.set_handle_redirect(False)
            # images are addressed relative to the URL without its
            # 'dailynews...txt' part
            gifurl = re.sub(r'dailynews.*txt', '', url)
            for img in imglist:
                gifimg = img.replace('jpg\'', 'gif\'')
                try:
                    br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
                except Exception:
                    pos = img.rfind('/')
                    newimg = img[0:pos+1] + '_' + img[pos+1:]
                    new_html = new_html.replace(img, newimg)
                else:
                    new_html = new_html.replace(img, gifimg)
            # repeat with src quoted by double quotes, for text parsed from src txt
            imglist = re.findall('src="?.*?jpg"', new_html)
            gifurl = url[:url.rfind('/')+1]
            for img in imglist:
                gifimg = img.replace('jpg"', 'gif"')
                try:
                    br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
                except Exception:
                    pos = img.find('"')
                    newimg = img[0:pos+1] + '_' + img[pos+1:]
                    new_html = new_html.replace(img, newimg)
                else:
                    new_html = new_html.replace(img, gifimg)
    return new_html
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
@ -447,77 +684,153 @@ class MPRecipe(BasicNewsRecipe):
|
||||
del item['absmiddle']
|
||||
return soup
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
    """Attach a TOC thumbnail and a summary line to *article*.

    article -- the article object; ``summary`` and ``text_summary`` are set.
    soup    -- the parsed article page.
    first   -- passed in by the caller; a thumbnail is only added when true.

    The thumbnail is the first ``<img>`` of the page and is added only when
    ``__IncludeThumbnails__`` is set, ``__HiResImg__`` is not, and the
    recipe object has an ``add_toc_thumbnail`` method.

    With ``__IncludeSummary__`` set and no summary text present yet, the
    first non-empty paragraph of each article body becomes the summary.
    Otherwise the summary is the character count of the article body.
    Any failure while building the summary is logged and otherwise ignored.
    """
    def find_bodies():
        # Try the known article containers in turn; the first match wins.
        found = []
        for attrs in ({'id': 'newscontent'}, {'id': 'newscontent01'},
                      {'class': 'content'}, {'id': 'font'}):
            found = soup.findAll('div', attrs=attrs)
            if found:
                break
        return found

    def paragraphs(body):
        # the text may or may not be enclosed in <p></p> tag
        paras = body.findAll('p')
        return paras if paras else body

    # thumbnails shouldn't be available if using hi-res images
    if __IncludeThumbnails__ and not __HiResImg__ and first and hasattr(self, 'add_toc_thumbnail'):
        img = soup.find('img')
        if img is not None:
            self.add_toc_thumbnail(article, img['src'])

    try:
        if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
            # look for content
            for articlebody in find_bodies():
                if not articlebody:
                    continue
                for p in paragraphs(articlebody):
                    summary_candidate = self.tag_to_string(p).strip()
                    summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
                    if len(summary_candidate) > 0:
                        article.summary = article.text_summary = summary_candidate
                        break
        else:
            # display word counts
            counts = 0
            articlebodies = find_bodies()
            if articlebodies:
                for articlebody in articlebodies:
                    for p in paragraphs(articlebody):
                        counts += len(self.tag_to_string(p).strip())
                # NOTE(review): the count is assumed to be written only when
                # an article body was found - confirm against the recipe.
                article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
    except Exception:
        # Summaries are best-effort; Exception (rather than a bare except)
        # keeps KeyboardInterrupt and SystemExit propagating.
        self.log("Error creating article descriptions")
|
||||
|
||||
# override from the one in version 0.8.31
|
||||
def create_opf(self, feeds, dir=None):
|
||||
if dir is None:
|
||||
dir = self.output_dir
|
||||
if __UseChineseTitle__ == True:
|
||||
if __Region__ == 'Hong Kong':
|
||||
title = u'\u660e\u5831 (\u9999\u6e2f)'
|
||||
elif __Region__ == 'Vancouver':
|
||||
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
|
||||
elif __Region__ == 'Toronto':
|
||||
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
|
||||
else:
|
||||
title = self.short_title()
|
||||
# if not generating a periodical, force date to apply in title
|
||||
if __MakePeriodical__ == False:
|
||||
title = self.short_title()
|
||||
# change 1: allow our own flag to tell if a periodical is to be generated
|
||||
# also use customed date instead of current time
|
||||
if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
|
||||
title = title + ' ' + self.get_fetchformatteddate()
|
||||
if True:
|
||||
mi = MetaInformation(title, [self.publisher])
|
||||
mi.publisher = self.publisher
|
||||
mi.author_sort = self.publisher
|
||||
if __MakePeriodical__ == True:
|
||||
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
|
||||
else:
|
||||
mi.publication_type = self.publication_type+':'+self.short_title()
|
||||
#mi.timestamp = nowf()
|
||||
mi.timestamp = self.get_dtlocal()
|
||||
mi.comments = self.description
|
||||
if not isinstance(mi.comments, unicode):
|
||||
mi.comments = mi.comments.decode('utf-8', 'replace')
|
||||
#mi.pubdate = nowf()
|
||||
mi.pubdate = self.get_dtlocal()
|
||||
opf_path = os.path.join(dir, 'index.opf')
|
||||
ncx_path = os.path.join(dir, 'index.ncx')
|
||||
opf = OPFCreator(dir, mi)
|
||||
# Add mastheadImage entry to <guide> section
|
||||
mp = getattr(self, 'masthead_path', None)
|
||||
if mp is not None and os.access(mp, os.R_OK):
|
||||
from calibre.ebooks.metadata.opf2 import Guide
|
||||
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
|
||||
ref.type = 'masthead'
|
||||
ref.title = 'Masthead Image'
|
||||
opf.guide.append(ref)
|
||||
# end of change 1
|
||||
# change 2: __appname__ replaced by newspaper publisher
|
||||
__appname__ = self.publisher
|
||||
mi = MetaInformation(title, [__appname__])
|
||||
mi.publisher = __appname__
|
||||
mi.author_sort = __appname__
|
||||
# change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
|
||||
if __MakePeriodical__ == True:
|
||||
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
|
||||
else:
|
||||
mi.publication_type = self.publication_type+':'+self.short_title()
|
||||
#mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
|
||||
# change 4: in the following, all the nowf() are changed to adjusted time
|
||||
# This one doesn't matter
|
||||
mi.timestamp = nowf()
|
||||
# change 5: skip listing the articles
|
||||
#article_titles, aseen = [], set()
|
||||
#for f in feeds:
|
||||
# for a in f:
|
||||
# if a.title and a.title not in aseen:
|
||||
# aseen.add(a.title)
|
||||
# article_titles.append(force_unicode(a.title, 'utf-8'))
|
||||
|
||||
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
|
||||
manifest.append(os.path.join(dir, 'index.html'))
|
||||
manifest.append(os.path.join(dir, 'index.ncx'))
|
||||
#mi.comments = self.description
|
||||
#if not isinstance(mi.comments, unicode):
|
||||
# mi.comments = mi.comments.decode('utf-8', 'replace')
|
||||
#mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
|
||||
# '\n\n'.join(article_titles))
|
||||
|
||||
# Get cover
|
||||
cpath = getattr(self, 'cover_path', None)
|
||||
if cpath is None:
|
||||
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
|
||||
if self.default_cover(pf):
|
||||
cpath = pf.name
|
||||
if cpath is not None and os.access(cpath, os.R_OK):
|
||||
opf.cover = cpath
|
||||
manifest.append(cpath)
|
||||
language = canonicalize_lang(self.language)
|
||||
if language is not None:
|
||||
mi.language = language
|
||||
# This one affects the pub date shown in kindle title
|
||||
#mi.pubdate = nowf()
|
||||
# now appears to need the time field to be > 12.00noon as well
|
||||
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
|
||||
opf_path = os.path.join(dir, 'index.opf')
|
||||
ncx_path = os.path.join(dir, 'index.ncx')
|
||||
|
||||
# Get masthead
|
||||
mpath = getattr(self, 'masthead_path', None)
|
||||
if mpath is not None and os.access(mpath, os.R_OK):
|
||||
manifest.append(mpath)
|
||||
opf = OPFCreator(dir, mi)
|
||||
# Add mastheadImage entry to <guide> section
|
||||
mp = getattr(self, 'masthead_path', None)
|
||||
if mp is not None and os.access(mp, os.R_OK):
|
||||
from calibre.ebooks.metadata.opf2 import Guide
|
||||
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
|
||||
ref.type = 'masthead'
|
||||
ref.title = 'Masthead Image'
|
||||
opf.guide.append(ref)
|
||||
|
||||
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
|
||||
manifest.append(os.path.join(dir, 'index.html'))
|
||||
manifest.append(os.path.join(dir, 'index.ncx'))
|
||||
|
||||
# Get cover
|
||||
cpath = getattr(self, 'cover_path', None)
|
||||
if cpath is None:
|
||||
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
|
||||
if self.default_cover(pf):
|
||||
cpath = pf.name
|
||||
if cpath is not None and os.access(cpath, os.R_OK):
|
||||
opf.cover = cpath
|
||||
manifest.append(cpath)
|
||||
|
||||
# Get masthead
|
||||
mpath = getattr(self, 'masthead_path', None)
|
||||
if mpath is not None and os.access(mpath, os.R_OK):
|
||||
manifest.append(mpath)
|
||||
|
||||
opf.create_manifest_from_files_in(manifest)
|
||||
for mani in opf.manifest:
|
||||
if mani.path.endswith('.ncx'):
|
||||
mani.id = 'ncx'
|
||||
if mani.path.endswith('mastheadImage.jpg'):
|
||||
mani.id = 'masthead-image'
|
||||
|
||||
entries = ['index.html']
|
||||
toc = TOC(base_path=dir)
|
||||
self.play_order_counter = 0
|
||||
self.play_order_map = {}
|
||||
|
||||
opf.create_manifest_from_files_in(manifest)
|
||||
for mani in opf.manifest:
|
||||
if mani.path.endswith('.ncx'):
|
||||
mani.id = 'ncx'
|
||||
if mani.path.endswith('mastheadImage.jpg'):
|
||||
mani.id = 'masthead-image'
|
||||
entries = ['index.html']
|
||||
toc = TOC(base_path=dir)
|
||||
self.play_order_counter = 0
|
||||
self.play_order_map = {}
|
||||
|
||||
def feed_index(num, parent):
|
||||
f = feeds[num]
|
||||
@ -532,13 +845,16 @@ class MPRecipe(BasicNewsRecipe):
|
||||
desc = None
|
||||
else:
|
||||
desc = self.description_limiter(desc)
|
||||
tt = a.toc_thumbnail if a.toc_thumbnail else None
|
||||
entries.append('%sindex.html'%adir)
|
||||
po = self.play_order_map.get(entries[-1], None)
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
|
||||
play_order=po, author=auth, description=desc)
|
||||
parent.add_item('%sindex.html'%adir, None,
|
||||
a.title if a.title else _('Untitled Article'),
|
||||
play_order=po, author=auth,
|
||||
description=desc, toc_thumbnail=tt)
|
||||
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
|
||||
for sp in a.sub_pages:
|
||||
prefix = os.path.commonprefix([opf_path, sp])
|
||||
@ -555,7 +871,7 @@ class MPRecipe(BasicNewsRecipe):
|
||||
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
|
||||
templ = self.navbar.generate(True, num, j, len(f),
|
||||
not self.has_single_feed,
|
||||
a.orig_url, self.publisher, prefix=prefix,
|
||||
a.orig_url, __appname__, prefix=prefix,
|
||||
center=self.center_navbar)
|
||||
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
|
||||
body.insert(len(body.contents), elem)
|
||||
@ -578,7 +894,7 @@ class MPRecipe(BasicNewsRecipe):
|
||||
if not desc:
|
||||
desc = None
|
||||
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
|
||||
f.title, play_order=po, description=desc, author=auth))
|
||||
f.title, play_order=po, description=desc, author=auth))
|
||||
|
||||
else:
|
||||
entries.append('feed_%d/index.html'%0)
|
||||
@ -592,3 +908,4 @@ class MPRecipe(BasicNewsRecipe):
|
||||
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
|
||||
opf.render(opf_file, ncx_file)
|
||||
|
||||
|
||||
|
@ -4,18 +4,41 @@ __copyright__ = '2010-2011, Eddie Lau'
|
||||
# Region - Hong Kong, Vancouver, Toronto
|
||||
__Region__ = 'Vancouver'
|
||||
# Users of Kindle 3 with limited system-level CJK support
|
||||
# please replace the following "True" with "False".
|
||||
# please replace the following "True" with "False". (Default: True)
|
||||
__MakePeriodical__ = True
|
||||
# Turn below to true if your device supports display of CJK titles
|
||||
# Turn below to True if your device supports display of CJK titles (Default: False)
|
||||
__UseChineseTitle__ = False
|
||||
# Set it to False if you want to skip images
|
||||
# Set it to False if you want to skip images (Default: True)
|
||||
__KeepImages__ = True
|
||||
# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
|
||||
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
|
||||
__IncludeSummary__ = False
|
||||
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
|
||||
__IncludeThumbnails__ = True
|
||||
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
|
||||
__UseLife__ = True
|
||||
# (HK only) Set it to True to include premium content (Default: False)
|
||||
__InclPremium__ = False
|
||||
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
|
||||
__ParsePFF__ = True
|
||||
# (HK only) Turn below to True if you wish hi-res images (Default: False)
|
||||
__HiResImg__ = False
|
||||
# Override the date returned by the program if specifying a YYYYMMDD below
|
||||
__Date__ = ''
|
||||
|
||||
|
||||
'''
|
||||
Change Log:
|
||||
2011/12/18: update the overridden create_opf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
|
||||
from create_opf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
|
||||
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
|
||||
2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
|
||||
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
|
||||
2011/10/19: fix a bug in txt source parsing
|
||||
2011/10/17: disable fetching of premium content, also improved txt source parsing
|
||||
2011/10/04: option to get hi-res photos for the articles
|
||||
2011/09/21: fetching "column" section is made optional.
|
||||
2011/09/18: parse "column" section stuff from source text file directly.
|
||||
2011/09/07: disable "column" section as it is no longer offered free.
|
||||
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
|
||||
provide options to remove all images in the file
|
||||
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
|
||||
@ -37,30 +60,38 @@ Change Log:
|
||||
2010/10/31: skip repeated articles in section pages
|
||||
'''
|
||||
|
||||
import os, datetime, re
|
||||
from calibre.utils.date import now as nowf
|
||||
import os, datetime, re, mechanize
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from contextlib import nested
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.utils.localization import canonicalize_lang
|
||||
|
||||
# MAIN CLASS
|
||||
class MPRecipe(BasicNewsRecipe):
|
||||
if __Region__ == 'Hong Kong':
|
||||
title = 'Ming Pao - Hong Kong'
|
||||
if __UseChineseTitle__ == True:
|
||||
title = u'\u660e\u5831 (\u9999\u6e2f)'
|
||||
else:
|
||||
title = 'Ming Pao - Hong Kong'
|
||||
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
|
||||
category = 'Chinese, News, Hong Kong'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
|
||||
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
|
||||
keep_only_tags = [dict(name='h1'),
|
||||
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
|
||||
dict(name='font', attrs={'color':['AA0000']}), # for column articles title
|
||||
dict(attrs={'class':['heading']}), # for heading from txt
|
||||
dict(attrs={'id':['newscontent']}), # entertainment and column page content
|
||||
dict(attrs={'id':['newscontent01','newscontent02']}),
|
||||
dict(attrs={'class':['content']}), # for content from txt
|
||||
dict(attrs={'class':['photo']}),
|
||||
dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
|
||||
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
|
||||
dict(name='img', attrs={'width':['180'], 'alt':['????']}), # images for source from life.mingpao.com
|
||||
dict(attrs={'class':['images']}) # for images from txt
|
||||
]
|
||||
if __KeepImages__:
|
||||
remove_tags = [dict(name='style'),
|
||||
@ -90,7 +121,10 @@ class MPRecipe(BasicNewsRecipe):
|
||||
lambda match: "</b>")
|
||||
]
|
||||
elif __Region__ == 'Vancouver':
|
||||
title = 'Ming Pao - Vancouver'
|
||||
if __UseChineseTitle__ == True:
|
||||
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
|
||||
else:
|
||||
title = 'Ming Pao - Vancouver'
|
||||
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
|
||||
category = 'Chinese, News, Vancouver'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
|
||||
@ -108,7 +142,10 @@ class MPRecipe(BasicNewsRecipe):
|
||||
lambda match: ''),
|
||||
]
|
||||
elif __Region__ == 'Toronto':
|
||||
title = 'Ming Pao - Toronto'
|
||||
if __UseChineseTitle__ == True:
|
||||
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
|
||||
else:
|
||||
title = 'Ming Pao - Toronto'
|
||||
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
|
||||
category = 'Chinese, News, Toronto'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
|
||||
@ -139,49 +176,12 @@ class MPRecipe(BasicNewsRecipe):
|
||||
conversion_options = {'linearize_tables':True}
|
||||
timefmt = ''
|
||||
|
||||
def image_url_processor(cls, baseurl, url):
    """Return *url* unchanged.

    An earlier, commented-out attempt to splice an extra ``_`` in front of
    the first digit of the URL lived here; its own comments marked it as
    not working, so it has been removed rather than kept as dead text.

    :param baseurl: not used
    :param url: image URL
    :return: *url* as given
    """
    return url
|
||||
|
||||
def get_dtlocal(self):
|
||||
dt_utc = datetime.datetime.utcnow()
|
||||
if __Region__ == 'Hong Kong':
|
||||
# convert UTC to local hk time - at HKT 5.30am, all news are available
|
||||
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
|
||||
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
|
||||
# convert UTC to local hk time - at HKT 4.30am, all news are available
|
||||
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
|
||||
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
|
||||
elif __Region__ == 'Vancouver':
|
||||
# convert UTC to local Vancouver time - at PST time 5.30am, all news are available
|
||||
dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
|
||||
@ -193,13 +193,34 @@ class MPRecipe(BasicNewsRecipe):
|
||||
return dt_local
|
||||
|
||||
def get_fetchdate(self):
    """Return the issue date to fetch as a ``YYYYMMDD`` string.

    A non-empty ``__Date__`` override is returned as-is; otherwise the
    date comes from ``self.get_dtlocal()``.
    """
    # The override has to be checked before the computed date, otherwise
    # it can never take effect.
    if __Date__ != '':
        return __Date__
    return self.get_dtlocal().strftime("%Y%m%d")
|
||||
|
||||
def get_fetchformatteddate(self):
    """Return the issue date formatted as ``YYYY-MM-DD``.

    A non-empty ``__Date__`` override (``YYYYMMDD``) is re-punctuated by
    slicing; otherwise the date comes from ``self.get_dtlocal()``.
    """
    # Check the override first so that it is actually reachable.
    if __Date__ != '':
        return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8]
    return self.get_dtlocal().strftime("%Y-%m-%d")
|
||||
|
||||
def get_fetchyear(self):
    """Return the four-digit year of the issue date as a string.

    Taken from the first four characters of a non-empty ``__Date__``
    override, otherwise from ``self.get_dtlocal()``.
    """
    # "!=" instead of "<>": the latter is a syntax error on Python 3.
    if __Date__ != '':
        return __Date__[0:4]
    return self.get_dtlocal().strftime("%Y")
|
||||
|
||||
def get_fetchmonth(self):
    """Return the two-digit month of the issue date as a string.

    Taken from characters 4-5 of a non-empty ``__Date__`` override,
    otherwise from ``self.get_dtlocal()``.
    """
    # "!=" instead of "<>": the latter is a syntax error on Python 3.
    if __Date__ != '':
        return __Date__[4:6]
    return self.get_dtlocal().strftime("%m")
|
||||
|
||||
def get_fetchday(self):
    """Return the two-digit day of the issue date as a string.

    Taken from characters 6-7 of a non-empty ``__Date__`` override,
    otherwise from ``self.get_dtlocal()``.
    """
    # Check the override first so that it is actually reachable.
    if __Date__ != '':
        return __Date__[6:8]
    return self.get_dtlocal().strftime("%d")
|
||||
|
||||
def get_cover_url(self):
|
||||
if __Region__ == 'Hong Kong':
|
||||
@ -230,12 +251,23 @@ class MPRecipe(BasicNewsRecipe):
|
||||
(u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
|
||||
(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
|
||||
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
|
||||
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
|
||||
(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
|
||||
articles = self.parse_section2(url, keystr)
|
||||
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
|
||||
]:
|
||||
if __InclPremium__ == True:
|
||||
articles = self.parse_section2_txt(url, keystr)
|
||||
else:
|
||||
articles = self.parse_section2(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
if __InclPremium__ == True:
|
||||
# parse column section articles directly from .txt files
|
||||
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
|
||||
]:
|
||||
articles = self.parse_section2_txt(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
@ -244,15 +276,16 @@ class MPRecipe(BasicNewsRecipe):
|
||||
else:
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
|
||||
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
|
||||
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
|
||||
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
|
||||
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
# special- editorial
|
||||
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
|
||||
if ed_articles:
|
||||
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
|
||||
#ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
|
||||
#if ed_articles:
|
||||
# feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
|
||||
|
||||
for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
|
||||
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
|
||||
@ -263,20 +296,39 @@ class MPRecipe(BasicNewsRecipe):
|
||||
|
||||
# special - finance
|
||||
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
|
||||
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
|
||||
if fin_articles:
|
||||
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
|
||||
#fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
|
||||
#if fin_articles:
|
||||
# feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
|
||||
|
||||
for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
||||
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
|
||||
articles = self.parse_section2_txt(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
||||
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
|
||||
# articles = self.parse_section(url)
|
||||
# if articles:
|
||||
# feeds.append((title, articles))
|
||||
|
||||
# special - entertainment
|
||||
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
|
||||
if ent_articles:
|
||||
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
|
||||
#ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
|
||||
#if ent_articles:
|
||||
# feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
|
||||
|
||||
for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
|
||||
]:
|
||||
articles = self.parse_section2_txt(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
if __InclPremium__ == True:
|
||||
# parse column section articles directly from .txt files
|
||||
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
|
||||
]:
|
||||
articles = self.parse_section2_txt(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
@ -284,11 +336,6 @@ class MPRecipe(BasicNewsRecipe):
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
|
||||
# special- columns
|
||||
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
|
||||
if col_articles:
|
||||
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
|
||||
elif __Region__ == 'Vancouver':
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
|
||||
(u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
|
||||
@ -332,6 +379,16 @@ class MPRecipe(BasicNewsRecipe):
|
||||
title = self.tag_to_string(a)
|
||||
url = a.get('href', False)
|
||||
url = 'http://news.mingpao.com/' + dateStr + '/' +url
|
||||
# replace the url to the print-friendly version
|
||||
if __ParsePFF__ == True:
|
||||
if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
|
||||
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
|
||||
url = re.sub('%2F.*%2F', '/', url)
|
||||
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
|
||||
url = url.replace('%2Etxt', '_print.htm')
|
||||
url = url.replace('%5F', '_')
|
||||
else:
|
||||
url = url.replace('.htm', '_print.htm')
|
||||
if url not in included_urls and url.rfind('Redirect') == -1:
|
||||
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
|
||||
included_urls.append(url)
|
||||
@ -340,6 +397,8 @@ class MPRecipe(BasicNewsRecipe):
|
||||
|
||||
# parse from life.mingpao.com
|
||||
def parse_section2(self, url, keystr):
|
||||
br = mechanize.Browser()
|
||||
br.set_handle_redirect(False)
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
@ -350,7 +409,29 @@ class MPRecipe(BasicNewsRecipe):
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
|
||||
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
|
||||
try:
|
||||
br.open_novisit(url)
|
||||
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
except:
|
||||
print 'skipping a premium article'
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
# parse from text file of life.mingpao.com
|
||||
def parse_section2_txt(self, url, keystr):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
a.reverse()
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
|
||||
url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') # use printed version of the article
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
@ -438,6 +519,162 @@ class MPRecipe(BasicNewsRecipe):
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
# preprocess those .txt and javascript based files
|
||||
def preprocess_raw_html(self, raw_html, url):
    """Turn Ming Pao ``.txt`` and JavaScript-based article pages into HTML.

    * URLs containing ``_print.htm`` are read line by line for
      ``var heading1``, ``var heading2``, ``var content`` and
      ``var photocontent`` assignments, which are re-emitted as
      ``heading`` / ``content`` / ``images`` ``<div>`` elements.
    * Other URLs containing ``ftp`` are treated as plain text in which
      lines starting with ``=`` name image files.
    * Any other page is passed through unchanged.

    When ``__HiResImg__`` is enabled, ``.jpg`` references in the result are
    then switched to ``.gif`` if that file can be opened, and to a
    ``_``-prefixed name otherwise.

    NOTE(review): the indentation of this method was lost in SOURCE; the
    nesting below is reconstructed from the control flow and should be
    confirmed against the recipe file.

    :param raw_html: text of the downloaded page
    :param url: URL the page was fetched from
    :return: the (possibly rewritten) HTML text
    """
    def js_value(line, prefix):
        # Drop the assignment prefix, then every single quote and semicolon.
        return line.replace(prefix, '').replace('\'', '').replace(';', '')

    new_html = raw_html
    if url.rfind('_print.htm') != -1:
        # javascript based file
        new_raw_html = '<html><head><title>Untitled</title></head>'
        new_raw_html = new_raw_html + '<body>'
        for item in raw_html.split('\n'):
            # Independent tests, not an elif chain, as in the original.
            if item.startswith('var heading1 ='):
                # The heading <div> is left open here; heading2 closes it.
                new_raw_html += '<div class="heading">' + js_value(item, 'var heading1 = \'')
            if item.startswith('var heading2 ='):
                heading = js_value(item, 'var heading2 = \'')
                if heading != '':
                    new_raw_html += '<br>' + heading + '</div>'
                else:
                    new_raw_html += '</div>'
            if item.startswith('var content ='):
                new_raw_html += '<div class="content">' + js_value(item, "var content = ") + '</div>'
            if item.startswith('var photocontent ='):
                photo = js_value(item, 'var photocontent = \'')
                # Flatten the photo table: drop rows/cells, one <br> per cell.
                photo = photo.replace('<tr>', '').replace('<td>', '')
                photo = photo.replace('</tr>', '').replace('</td>', '<br>')
                photo = photo.replace('class="photo"', '')
                new_raw_html += '<div class="images">' + photo + '</div>'
        new_html = new_raw_html + '</body></html>'
    elif url.rfind('ftp') != -1:
        # .txt based file
        new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
        next_is_img_txt = False
        title_started = False
        title_break_reached = False
        met_article_start_char = False
        for item in raw_html.split('\n'):
            item = item.strip()
            if title_started and not title_break_reached and item == '':
                # first blank line after the title ends the heading
                title_break_reached = True
            elif title_started and title_break_reached and not met_article_start_char:
                # first non-empty line after that break opens the content
                if item != '':
                    met_article_start_char = True
                    new_raw_html += '</div><div class="content"><p>' + item + '<p>\n'
            elif next_is_img_txt:
                # the line right after an image marker is copied without <p>
                next_is_img_txt = False
                new_raw_html += item + '\n'
            elif item.startswith("=@"):
                print('skip movie link')
            elif item.startswith("=?"):
                next_is_img_txt = True
                new_raw_html += '<img src="' + item[2:].strip() + '.gif" /><p>\n'
            elif item.startswith('=='):
                next_is_img_txt = True
                new_raw_html += '<img src="' + item[2:].strip() + '.jpg" /><p>\n'
            elif item.startswith('='):
                next_is_img_txt = True
                new_raw_html += '<img src="' + item[1:].strip() + '.jpg" /><p>\n'
            elif not met_article_start_char:
                # still in front of the content: non-empty lines are title text
                if item != '':
                    if not title_started:
                        new_raw_html += '</div><div class="heading">' + item + '\n'
                        title_started = True
                    else:
                        new_raw_html += item + '\n'
            else:
                # ordinary content line
                new_raw_html += item + '<p>\n'
        new_html = new_raw_html + '</div></body></html>'

    if __HiResImg__:
        # TODO: add a _ in front of an image url
        if url.rfind('news.mingpao.com') > -1:
            imglist = re.findall('src="?.*?jpg"', new_html)
            br = mechanize.Browser()
            br.set_handle_redirect(False)
            for img in imglist:
                gifimg = img.replace('jpg"', 'gif"')
                try:
                    # [5:-1] strips the leading 'src="' and the closing quote
                    br.open_novisit(url + "/../" + gifimg[5:-1])
                except Exception:
                    # .gif could not be opened: fall back to a "_" variant
                    pos = img.find('_')
                    if pos > -1:
                        # double the first "_" in the reference
                        newimg = img[0:pos] + '_' + img[pos:]
                    else:
                        # No "_" present: put one at the start of the file
                        # name.  (The original replaced img[1:] and thereby
                        # broke the src attribute itself.)
                        start = 5 if img.startswith('src="') else 4
                        newimg = img[0:start] + '_' + img[start:]
                    new_html = new_html.replace(img, newimg)
                else:
                    new_html = new_html.replace(img, gifimg)
        elif url.rfind('life.mingpao.com') > -1:
            imglist = re.findall('src=\'?.*?jpg\'', new_html)
            br = mechanize.Browser()
            br.set_handle_redirect(False)
            for img in imglist:
                gifimg = img.replace('jpg\'', 'gif\'')
                try:
                    gifurl = re.sub(r'dailynews.*txt', '', url)
                    br.open_novisit(gifurl + gifimg[5:-1])
                except Exception:
                    # prefix the file name (text after the last "/") with "_"
                    pos = img.rfind('/')
                    newimg = img[0:pos+1] + '_' + img[pos+1:]
                    new_html = new_html.replace(img, newimg)
                else:
                    new_html = new_html.replace(img, gifimg)
            # repeat with src quoted by double quotes, for text parsed from src txt
            imglist = re.findall('src="?.*?jpg"', new_html)
            for img in imglist:
                gifimg = img.replace('jpg"', 'gif"')
                try:
                    # resolve against the directory part of the page URL
                    pos = url.rfind('/')
                    gifurl = url[:pos+1]
                    br.open_novisit(gifurl + gifimg[5:-1])
                except Exception:
                    # insert "_" right after the first double quote
                    pos = img.find('"')
                    newimg = img[0:pos+1] + '_' + img[pos+1:]
                    new_html = new_html.replace(img, newimg)
                else:
                    new_html = new_html.replace(img, gifimg)
    return new_html
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
@ -447,77 +684,153 @@ class MPRecipe(BasicNewsRecipe):
|
||||
del item['absmiddle']
|
||||
return soup
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
    """Attach a TOC thumbnail and a summary to *article*.

    Thumbnail: when ``__IncludeThumbnails__`` is set, ``__HiResImg__`` is
    off, *first* is true and ``self`` has ``add_toc_thumbnail``, the first
    ``<img>`` found in *soup* is registered as the thumbnail.

    Summary: when ``__IncludeSummary__`` is set and the article has no
    text summary yet, the first non-empty paragraph of each article body
    is stored as the summary.  Otherwise the summary is set to the total
    length of the body text, wrapped in full-width parentheses.

    Any exception raised while building the summary is logged through
    ``self.log`` and otherwise ignored.

    NOTE(review): the indentation of this method was lost in SOURCE; the
    nesting (in particular that the word-count summary is assigned even
    when no article body is found) is reconstructed -- confirm against
    the recipe file.

    :param article: article object whose ``summary`` / ``text_summary``
        attributes are updated
    :param soup: parsed article page
    :param first: flag required, together with the settings above, for
        the thumbnail step to run
    """
    def find_article_bodies():
        # Try each known content container in turn and stop at the first
        # lookup that finds something; otherwise hand back the last (empty)
        # result.
        found = []
        for attrs in ({'id': 'newscontent'}, {'id': 'newscontent01'},
                      {'class': 'content'}, {'id': 'font'}):
            found = soup.findAll('div', attrs=attrs)
            if found:
                break
        return found

    # thumbnails shouldn't be available if using hi-res images
    if __IncludeThumbnails__ and not __HiResImg__ and first and hasattr(self, 'add_toc_thumbnail'):
        img = soup.find('img')
        if img is not None:
            self.add_toc_thumbnail(article, img['src'])

    try:
        if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
            # look for content
            for articlebody in find_article_bodies():
                if not articlebody:
                    continue
                # the text may or may not be enclosed in <p></p> tag
                paras = articlebody.findAll('p') or articlebody
                for p in paras:
                    summary_candidate = self.tag_to_string(p).strip()
                    summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
                    if len(summary_candidate) > 0:
                        article.summary = article.text_summary = summary_candidate
                        # first non-empty paragraph of this body is enough
                        break
        else:
            # display word counts
            counts = 0
            for articlebody in find_article_bodies():
                # the text may or may not be enclosed in <p></p> tag
                paras = articlebody.findAll('p') or articlebody
                for p in paras:
                    counts += len(self.tag_to_string(p).strip())
            article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
    except Exception:
        # Best effort only: a summary failure must not abort the download,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        self.log("Error creating article descriptions")
|
||||
|
||||
# override from the one in version 0.8.31
|
||||
def create_opf(self, feeds, dir=None):
|
||||
if dir is None:
|
||||
dir = self.output_dir
|
||||
if __UseChineseTitle__ == True:
|
||||
if __Region__ == 'Hong Kong':
|
||||
title = u'\u660e\u5831 (\u9999\u6e2f)'
|
||||
elif __Region__ == 'Vancouver':
|
||||
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
|
||||
elif __Region__ == 'Toronto':
|
||||
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
|
||||
else:
|
||||
title = self.short_title()
|
||||
# if not generating a periodical, force date to apply in title
|
||||
if __MakePeriodical__ == False:
|
||||
title = self.short_title()
|
||||
# change 1: allow our own flag to tell if a periodical is to be generated
|
||||
# also use customed date instead of current time
|
||||
if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
|
||||
title = title + ' ' + self.get_fetchformatteddate()
|
||||
if True:
|
||||
mi = MetaInformation(title, [self.publisher])
|
||||
mi.publisher = self.publisher
|
||||
mi.author_sort = self.publisher
|
||||
if __MakePeriodical__ == True:
|
||||
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
|
||||
else:
|
||||
mi.publication_type = self.publication_type+':'+self.short_title()
|
||||
#mi.timestamp = nowf()
|
||||
mi.timestamp = self.get_dtlocal()
|
||||
mi.comments = self.description
|
||||
if not isinstance(mi.comments, unicode):
|
||||
mi.comments = mi.comments.decode('utf-8', 'replace')
|
||||
#mi.pubdate = nowf()
|
||||
mi.pubdate = self.get_dtlocal()
|
||||
opf_path = os.path.join(dir, 'index.opf')
|
||||
ncx_path = os.path.join(dir, 'index.ncx')
|
||||
opf = OPFCreator(dir, mi)
|
||||
# Add mastheadImage entry to <guide> section
|
||||
mp = getattr(self, 'masthead_path', None)
|
||||
if mp is not None and os.access(mp, os.R_OK):
|
||||
from calibre.ebooks.metadata.opf2 import Guide
|
||||
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
|
||||
ref.type = 'masthead'
|
||||
ref.title = 'Masthead Image'
|
||||
opf.guide.append(ref)
|
||||
# end of change 1
|
||||
# change 2: __appname__ replaced by newspaper publisher
|
||||
__appname__ = self.publisher
|
||||
mi = MetaInformation(title, [__appname__])
|
||||
mi.publisher = __appname__
|
||||
mi.author_sort = __appname__
|
||||
# change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
|
||||
if __MakePeriodical__ == True:
|
||||
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
|
||||
else:
|
||||
mi.publication_type = self.publication_type+':'+self.short_title()
|
||||
#mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
|
||||
# change 4: in the following, all the nowf() are changed to adjusted time
|
||||
# This one doesn't matter
|
||||
mi.timestamp = nowf()
|
||||
# change 5: skip listing the articles
|
||||
#article_titles, aseen = [], set()
|
||||
#for f in feeds:
|
||||
# for a in f:
|
||||
# if a.title and a.title not in aseen:
|
||||
# aseen.add(a.title)
|
||||
# article_titles.append(force_unicode(a.title, 'utf-8'))
|
||||
|
||||
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
|
||||
manifest.append(os.path.join(dir, 'index.html'))
|
||||
manifest.append(os.path.join(dir, 'index.ncx'))
|
||||
#mi.comments = self.description
|
||||
#if not isinstance(mi.comments, unicode):
|
||||
# mi.comments = mi.comments.decode('utf-8', 'replace')
|
||||
#mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
|
||||
# '\n\n'.join(article_titles))
|
||||
|
||||
# Get cover
|
||||
cpath = getattr(self, 'cover_path', None)
|
||||
if cpath is None:
|
||||
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
|
||||
if self.default_cover(pf):
|
||||
cpath = pf.name
|
||||
if cpath is not None and os.access(cpath, os.R_OK):
|
||||
opf.cover = cpath
|
||||
manifest.append(cpath)
|
||||
language = canonicalize_lang(self.language)
|
||||
if language is not None:
|
||||
mi.language = language
|
||||
# This one affects the pub date shown in kindle title
|
||||
#mi.pubdate = nowf()
|
||||
# now appears to need the time field to be > 12.00noon as well
|
||||
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
|
||||
opf_path = os.path.join(dir, 'index.opf')
|
||||
ncx_path = os.path.join(dir, 'index.ncx')
|
||||
|
||||
# Get masthead
|
||||
mpath = getattr(self, 'masthead_path', None)
|
||||
if mpath is not None and os.access(mpath, os.R_OK):
|
||||
manifest.append(mpath)
|
||||
opf = OPFCreator(dir, mi)
|
||||
# Add mastheadImage entry to <guide> section
|
||||
mp = getattr(self, 'masthead_path', None)
|
||||
if mp is not None and os.access(mp, os.R_OK):
|
||||
from calibre.ebooks.metadata.opf2 import Guide
|
||||
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
|
||||
ref.type = 'masthead'
|
||||
ref.title = 'Masthead Image'
|
||||
opf.guide.append(ref)
|
||||
|
||||
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
|
||||
manifest.append(os.path.join(dir, 'index.html'))
|
||||
manifest.append(os.path.join(dir, 'index.ncx'))
|
||||
|
||||
# Get cover
|
||||
cpath = getattr(self, 'cover_path', None)
|
||||
if cpath is None:
|
||||
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
|
||||
if self.default_cover(pf):
|
||||
cpath = pf.name
|
||||
if cpath is not None and os.access(cpath, os.R_OK):
|
||||
opf.cover = cpath
|
||||
manifest.append(cpath)
|
||||
|
||||
# Get masthead
|
||||
mpath = getattr(self, 'masthead_path', None)
|
||||
if mpath is not None and os.access(mpath, os.R_OK):
|
||||
manifest.append(mpath)
|
||||
|
||||
opf.create_manifest_from_files_in(manifest)
|
||||
for mani in opf.manifest:
|
||||
if mani.path.endswith('.ncx'):
|
||||
mani.id = 'ncx'
|
||||
if mani.path.endswith('mastheadImage.jpg'):
|
||||
mani.id = 'masthead-image'
|
||||
|
||||
entries = ['index.html']
|
||||
toc = TOC(base_path=dir)
|
||||
self.play_order_counter = 0
|
||||
self.play_order_map = {}
|
||||
|
||||
opf.create_manifest_from_files_in(manifest)
|
||||
for mani in opf.manifest:
|
||||
if mani.path.endswith('.ncx'):
|
||||
mani.id = 'ncx'
|
||||
if mani.path.endswith('mastheadImage.jpg'):
|
||||
mani.id = 'masthead-image'
|
||||
entries = ['index.html']
|
||||
toc = TOC(base_path=dir)
|
||||
self.play_order_counter = 0
|
||||
self.play_order_map = {}
|
||||
|
||||
def feed_index(num, parent):
|
||||
f = feeds[num]
|
||||
@ -532,13 +845,16 @@ class MPRecipe(BasicNewsRecipe):
|
||||
desc = None
|
||||
else:
|
||||
desc = self.description_limiter(desc)
|
||||
tt = a.toc_thumbnail if a.toc_thumbnail else None
|
||||
entries.append('%sindex.html'%adir)
|
||||
po = self.play_order_map.get(entries[-1], None)
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
|
||||
play_order=po, author=auth, description=desc)
|
||||
parent.add_item('%sindex.html'%adir, None,
|
||||
a.title if a.title else _('Untitled Article'),
|
||||
play_order=po, author=auth,
|
||||
description=desc, toc_thumbnail=tt)
|
||||
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
|
||||
for sp in a.sub_pages:
|
||||
prefix = os.path.commonprefix([opf_path, sp])
|
||||
@ -555,7 +871,7 @@ class MPRecipe(BasicNewsRecipe):
|
||||
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
|
||||
templ = self.navbar.generate(True, num, j, len(f),
|
||||
not self.has_single_feed,
|
||||
a.orig_url, self.publisher, prefix=prefix,
|
||||
a.orig_url, __appname__, prefix=prefix,
|
||||
center=self.center_navbar)
|
||||
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
|
||||
body.insert(len(body.contents), elem)
|
||||
@ -578,7 +894,7 @@ class MPRecipe(BasicNewsRecipe):
|
||||
if not desc:
|
||||
desc = None
|
||||
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
|
||||
f.title, play_order=po, description=desc, author=auth))
|
||||
f.title, play_order=po, description=desc, author=auth))
|
||||
|
||||
else:
|
||||
entries.append('feed_%d/index.html'%0)
|
||||
@ -592,3 +908,4 @@ class MPRecipe(BasicNewsRecipe):
|
||||
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
|
||||
opf.render(opf_file, ncx_file)
|
||||
|
||||
|
||||
|
76
recipes/money_pl.recipe
Normal file
76
recipes/money_pl.recipe
Normal file
@ -0,0 +1,76 @@
|
||||
import re
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class FocusRecipe(BasicNewsRecipe):
    """Calibre news recipe for Money.pl (Polish financial news).

    Articles are discovered through the feedsportal RSS feeds listed in
    ``feeds``; ``print_version`` rewrites each feed link into the URL that
    is actually downloaded.

    NOTE(review): the class is named ``FocusRecipe`` although every piece of
    metadata refers to Money.pl; the name is kept because it is the block's
    public identifier.
    """

    __license__ = 'GPL v3'
    __author__ = u'intromatyk <intromatyk@gmail.com>'
    language = 'pl'
    version = 1

    title = u'Money.pl'
    category = u'News'
    description = u'Informacje finansowe z kraju i ze świata. Aktualne i archiwalne: notowania giełdowe, kursy walut, wskaźniki gospodarcze.'
    remove_empty_feeds = True
    oldest_article = 1
    max_articles_per_feed = 100000
    recursions = 0

    no_stylesheets = True
    remove_javascript = True

    simultaneous_downloads = 2

    # Raw string so the backslashes reach the regex engine verbatim; the
    # pattern text itself is unchanged.
    # NOTE(review): nothing in this class reads ``r``, and the top-level
    # ``|`` splits the pattern into two alternatives rather than two hosts;
    # confirm whether the attribute is still needed.
    r = re.compile(r'.*(?P<url>http:\/\/(www.money.pl)|(rss.feedsportal.com\/c)\/.*\.html?).*')

    # Keep only the article container and drop the social-media widget.
    keep_only_tags = [{'name': 'div', 'attrs': {'class': 'artykul'}}]
    remove_tags = [{'name': 'ul', 'attrs': {'class': 'socialStuff'}}]

    extra_css = '''
        body {font-family: Arial,Helvetica,sans-serif ;}
        h1{text-align: left;}
        h2{font-size: medium; font-weight: bold;}
        p.lead {font-weight: bold; text-align: left;}
        .authordate {font-size: small; color: #696969;}
        .fot{font-size: x-small; color: #666666;}
        '''

    feeds = [
        ('Wiadomosci z kraju', 'http://money.pl.feedsportal.com/c/33900/f/612847/index.rss'),
        ('Wiadomosci ze swiata', 'http://money.pl.feedsportal.com/c/33900/f/612848/index.rss'),
        ('Gospodarka', 'http://money.pl.feedsportal.com/c/33900/f/612849/index.rss'),
        ('Waluty', 'http://money.pl.feedsportal.com/c/33900/f/612850/index.rss'),
        ('Gielda', 'http://money.pl.feedsportal.com/c/33900/f/612851/index.rss'),
        ('Banki', 'http://money.pl.feedsportal.com/c/33900/f/612852/index.rss'),
        ('Fundusze', 'http://money.pl.feedsportal.com/c/33900/f/612853/index.rss'),
        ('Emerytury', 'http://money.pl.feedsportal.com/c/33900/f/612854/index.rss'),
        ('Podatki', 'http://money.pl.feedsportal.com/c/33900/f/612855/index.rss'),
        ('Ubezpieczenia', 'http://money.pl.feedsportal.com/c/33900/f/612856/index.rss'),
        ('Poradniki', 'http://money.pl.feedsportal.com/c/33900/f/612857/index.rss'),
        ('Raporty', 'http://money.pl.feedsportal.com/c/33900/f/612858/index.rss'),
        ('Motoryzacja', 'http://money.pl.feedsportal.com/c/33900/f/612859/index.rss'),
        ('Manager', 'http://money.pl.feedsportal.com/c/33900/f/612860/index.rss'),
        ('Dla firm', 'http://money.pl.feedsportal.com/c/33900/f/612861/index.rss'),
        ('Prawo', 'http://money.pl.feedsportal.com/c/33900/f/612862/index.rss'),
        ('Nieruchomosci', 'http://money.pl.feedsportal.com/c/33900/f/612863/index.rss'),
        ('Praca', 'http://money.pl.feedsportal.com/c/33900/f/612864/index.rss'),
    ]

    def print_version(self, url):
        """Return the URL to download for the feed link *url*.

        Feedsportal links carry the article path in an escaped form
        (``0C`` for ``/``, ``0E`` for ``-`` and so on); it is decoded and
        re-rooted on www.m.money.pl. Any other link has ``/nc/1`` swapped
        for ``/do-druku/1``. A feedsportal link without the article marker
        is returned unchanged instead of being decoded from an arbitrary
        offset.
        """
        if 'money.pl.feedsportal.com' not in url:
            return url.replace('/nc/1', '/do-druku/1')

        marker = url.find('0Cartykul0C')
        if marker < 0:
            # find() returned -1: slicing from marker + 21 would start at a
            # meaningless position and yield a URL that cannot be fetched.
            return url

        # NOTE(review): +21 skips the 11-character marker plus 10 further
        # characters; presumably a fixed-length path prefix - confirm
        # against live feed URLs.
        u = 'http://www.m.money.pl/wiadomosci/artykul/' + url[marker + 21:]

        # Order matters: later substitutions operate on the output of the
        # earlier ones (e.g. the tracking suffix already contains '/').
        substitutions = (
            ('0C', '/'),
            ('A', ''),
            ('0E', '-'),
            ('0P', ';'),
            ('0H', ','),
            ('0B', '.'),
            (',0,', ',-1,'),
            ('0Tutm0Isource0Frss0Gutm0Imedium0Frss0Gutm0Icampaign0Frss/story01.htm', ''),
        )
        for escaped, plain in substitutions:
            u = u.replace(escaped, plain)
        return u
|
@ -1,9 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
moneynews.newsmax.com
|
||||
www.moneynews.com
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
@ -12,40 +10,40 @@ class MoneyNews(BasicNewsRecipe):
|
||||
title = 'Moneynews.com'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'Financial news worldwide'
|
||||
publisher = 'moneynews.com'
|
||||
language = 'en'
|
||||
|
||||
publisher = 'Newsmax.com'
|
||||
language = 'en'
|
||||
category = 'news, finances, USA, business'
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
encoding = 'cp1252'
|
||||
|
||||
html2lrf_options = [
|
||||
'--comment', description
|
||||
, '--category', category
|
||||
, '--publisher', publisher
|
||||
, '--ignore-tables'
|
||||
]
|
||||
|
||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
|
||||
|
||||
encoding = 'utf8'
|
||||
extra_css = 'img{display: block} body{font-family: Arial, Helvetica, sans-serif}'
|
||||
|
||||
conversion_options = {
|
||||
'comment' : description
|
||||
, 'tags' : category
|
||||
, 'publisher' : publisher
|
||||
, 'language' : language
|
||||
, 'linearize_tables' : True
|
||||
}
|
||||
|
||||
feeds = [
|
||||
(u'Street Talk' , u'http://moneynews.newsmax.com/xml/streettalk.xml' )
|
||||
,(u'Finance News' , u'http://moneynews.newsmax.com/xml/FinanceNews.xml' )
|
||||
,(u'Economy' , u'http://moneynews.newsmax.com/xml/economy.xml' )
|
||||
,(u'Companies' , u'http://moneynews.newsmax.com/xml/companies.xml' )
|
||||
,(u'Markets' , u'http://moneynews.newsmax.com/xml/Markets.xml' )
|
||||
,(u'Investing & Analysis' , u'http://moneynews.newsmax.com/xml/investing.xml' )
|
||||
(u'Street Talk' , u'http://www.moneynews.com/rss/StreetTalk/8.xml' )
|
||||
,(u'Finance News' , u'http://www.moneynews.com/rss/FinanceNews/4.xml' )
|
||||
,(u'Economy' , u'http://www.moneynews.com/rss/Economy/2.xml' )
|
||||
,(u'Companies' , u'http://www.moneynews.com/rss/Companies/6.xml' )
|
||||
,(u'Markets' , u'http://www.moneynews.com/rss/Markets/7.xml' )
|
||||
,(u'Investing & Analysis' , u'http://www.moneynews.com/rss/InvestingAnalysis/17.xml')
|
||||
]
|
||||
|
||||
|
||||
keep_only_tags = [dict(name='table', attrs={'class':'copy'})]
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class':'copy'})]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='td' , attrs={'id':'article_fontsize'})
|
||||
,dict(name='table', attrs={'id':'toolbox' })
|
||||
,dict(name='tr' , attrs={'id':'noprint3' })
|
||||
dict(attrs={'class':['MsoNormal', 'MsoNoSpacing']}),
|
||||
dict(name=['object','link','embed','form','meta'])
|
||||
]
|
||||
|
||||
|
||||
def print_version(self, url):
|
||||
nodeid = url.rpartition('/')[2]
|
||||
return 'http://www.moneynews.com/PrintTemplate?nodeid=' + nodeid
|
||||
|
@ -50,7 +50,7 @@ class Moscowtimes(BasicNewsRecipe):
|
||||
dict(name='div', attrs={'class':['photo_nav','phototext']})
|
||||
,dict(name=['iframe','meta','base','link','embed','object'])
|
||||
]
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for lnk in soup.findAll('a'):
|
||||
if lnk.string is not None:
|
||||
@ -58,13 +58,13 @@ class Moscowtimes(BasicNewsRecipe):
|
||||
lnk.replaceWith(ind)
|
||||
return soup
|
||||
|
||||
def print_version(self, url):
|
||||
def print_version(self, url):
|
||||
return url.replace('.themoscowtimes.com/','.themoscowtimes.com/print/')
|
||||
|
||||
def get_cover_url(self):
|
||||
cover_url = None
|
||||
href = 'http://www.themoscowtimes.com/pdf/'
|
||||
soup = self.index_to_soup(href)
|
||||
soup = self.index_to_soup(href)
|
||||
div = soup.find('div',attrs={'class':'left'})
|
||||
if div:
|
||||
a = div.find('a')
|
||||
|
143
recipes/mwjournal.recipe
Normal file
143
recipes/mwjournal.recipe
Normal file
@ -0,0 +1,143 @@
|
||||
##
|
||||
## Title: Microwave Journal RSS recipe
|
||||
## Contact: Kiavash (use Mobile Read)
|
||||
##
|
||||
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
|
||||
## Copyright: Kiavash
|
||||
##
|
||||
## Written: Jan 2012
|
||||
## Last Edited: Jan 2012
|
||||
##
|
||||
|
||||
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
|
||||
__copyright__ = 'Kiavash'
|
||||
# Spelling matches __copyright__ above and the recipe class's own __author__.
__author__ = 'Kiavash'
|
||||
|
||||
'''
|
||||
Microwave Journal Monthly Magazine
|
||||
You need to sign up (free) and get username/password.
|
||||
'''
|
||||
|
||||
import re # Import the regular expressions module.
|
||||
from calibre.ptempfile import TemporaryFile # we need this for saving to a temp file
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class MWJournal(BasicNewsRecipe):
    """Calibre recipe for the monthly Microwave Journal (mwjournal.com).

    The site needs a (free) account; the credentials are submitted through
    the two-step login implemented in ``get_browser``.
    """

    # --- Book metadata ---------------------------------------------------
    title = u'Microwave Journal'
    __author__ = 'Kiavash'
    language = 'en'
    description = u'Microwave Journal web site ebook created using rss feeds.'
    publisher = 'Horizon House'
    publication_type = 'magazine'

    # --- Fetch behaviour -------------------------------------------------
    # Monthly magazine, so look back over the longest possible month.
    oldest_article = 31
    max_articles_per_feed = 100
    remove_empty_feeds = True
    auto_cleanup = True

    # Ignore the site's stylesheets and scripts.
    no_stylesheets = True
    remove_javascript = True

    # Convert all non-ASCII characters to their ASCII equivalents.
    asciiize = True

    # A login is required.
    needs_subscription = True

    # Seconds to wait when fetching from the server; deliberately shorter
    # than the 120-second default mentioned by the recipe author.
    timeout = 30

    # Extra CSS is added last and therefore overrides all other CSS.
    extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
        .introduction, .first { font-weight: bold; } \
        .cross-head { font-weight: bold; font-size: 125%; } \
        .cap, .caption { display: block; font-size: 80%; font-style: italic; } \
        .cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
        .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
        .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
        font-size: 80%; font-style: italic; margin: 1px auto; } \
        .story-date, .published { font-size: 80%; } \
        table { width: 100%; } \
        td img { display: block; margin: 5px auto; } \
        ul { padding-top: 10px; } \
        ol { padding-top: 10px; } \
        li { padding-top: 5px; padding-bottom: 5px; } \
        h1 { font-size: 175%; font-weight: bold; } \
        h2 { font-size: 150%; font-weight: bold; } \
        h3 { font-size: 125%; font-weight: bold; } \
        h4, h5, h6 { font-size: 100%; font-weight: bold; }'

    remove_tags = [
        # Banner ads.
        {'name': 'div', 'attrs': {'class': 'boxadzonearea350'}},
        # <font class="footer"> elements; comment this entry out to keep
        # the website's fonts.
        {'name': 'font', 'attrs': {'class': 'footer'}},
        {'name': 'div', 'attrs': {'class': 'newsarticlead'}},
    ]

    # Presentational attributes stripped to improve the look of the pages.
    remove_attributes = [
        'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
        'valign', 'vspace', 'hspace', 'alt', 'width', 'height',
    ]

    # Drop line breaks and unwrap hyperlinks (opening and closing <a> tags
    # are removed, their text stays).
    preprocess_regexps = [
        (re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda match: ''),
        (re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda match: ''),
        (re.compile(r'<a.*?>'), lambda match: ''),
        (re.compile(r'</a>'), lambda match: ''),
    ]

    # Feeds to include in the ebook.
    feeds = [
        (u'Current Issue', u'http://www.mwjournal.com/rss/Rss.asp?type=99'),
        (u'Industry News', u'http://www.mwjournal.com/rss/Rss.asp?type=1'),
        (u'Resources', u'http://www.mwjournal.com/rss/Rss.asp?type=3'),
        (u'Buyer\'s Guide', u'http://www.mwjournal.com/rss/Rss.asp?type=5'),
        (u'Events', u'http://www.mwjournal.com/rss/Rss.asp?type=2'),
        (u'All Updates', u'http://www.mwjournal.com/rss/Rss.asp?type=0'),
    ]

    def get_cover_url(self):
        """Return the URL of the current issue's cover image, or None.

        The cover file name changes every month, so the image is located by
        a fixed fragment of its ``src`` ('/IssueImg/3_MWJ_CurrIss_CoverImg');
        per the recipe author this selects the highest-resolution of the
        three cover files on the page.
        """
        soup = self.index_to_soup('http://www.mwjournal.com/Journal/')
        image = soup.find(
            'img',
            attrs={'src': lambda x: x and '/IssueImg/3_MWJ_CurrIss_CoverImg' in x})
        if not image:
            return None
        return 'http://www.mwjournal.com' + image['src'].strip()

    def print_version(self, url):
        """Map an article URL to its printer-friendly counterpart.

        The URL is opened first so that the final address (after any
        redirect) is the one rewritten. URLs matching none of the known
        article paths yield None.
        """
        rewrites = (
            ('/Journal/article.asp?HH_ID=', '/Journal/Print.asp?Id='),
            ('/News/article.asp?HH_ID=', '/Journal/Print.asp?Id='),
            ('/Resources/TechLib.asp?HH_ID=', '/Resources/PrintRessource.asp?Id='),
        )
        for article_path, print_path in rewrites:
            if article_path in url:
                resolved = self.browser.open_novisit(url).geturl()
                return resolved.replace(article_path, print_path)
        # NOTE(review): presumably the caller skips articles for which None
        # is returned - confirm this is the intended handling of other URLs.
        return None

    def get_browser(self):
        '''
        Log in to Microwave Journal and return the browser.

        The login page hands the credentials to omeda.com, which answers
        with a second form that logs the browser in to mwjournal.com. The
        first response is therefore saved to a temporary HTML file, reopened
        locally, and its form submitted (second-page login credited to
        Barty; temp-file approach taken from the ESPN recipe by Kovid Goyal).

        Raises Exception when the final page lacks the 'Welcome ' greeting.
        '''
        br = BasicNewsRecipe.get_browser()
        if self.username is None or self.password is None:
            return br

        # First login form.
        br.open('http://www.omeda.com/cgi-win/mwjreg.cgi?m=login')
        br.select_form('login')
        br['EMAIL_ADDRESS'] = self.username
        br['PASSWORD'] = self.password
        second_form_html = br.submit().read()

        # Second login form, replayed from a local copy of the response.
        with TemporaryFile(suffix='.htm') as fname:
            with open(fname, 'wb') as f:
                f.write(second_form_html)
            br.open_local_file(fname)
            br.select_form(nr=0)
            landing_page = br.submit().read()
            if 'Welcome ' not in landing_page:
                raise Exception('Failed to login, are you sure your username and password are correct?')
        return br
|
@ -1,14 +1,25 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
##
|
||||
## Title: New Journal of Physics
|
||||
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
|
||||
## Copyright: Chema Cort\xe9s
|
||||
##
|
||||
## Written: Jan 2011
|
||||
## Last Edited: Jan 2012 - by Kiavash
|
||||
##
|
||||
|
||||
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
|
||||
__copyright__ = u'Chema Cort\xe9s - 2011-01-05'
|
||||
__version__ = 'v0.01'
|
||||
__date__ = '2011-01-05'
|
||||
__version__ = 'v0.5.0'
|
||||
__date__ = '2012-01-13'
|
||||
|
||||
'''
|
||||
njp.org
|
||||
'''
|
||||
|
||||
import re # Import the regular expressions module.
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class NewJournalOfPhysics(BasicNewsRecipe):
|
||||
@ -18,15 +29,61 @@ class NewJournalOfPhysics(BasicNewsRecipe):
|
||||
publisher = u'IOP (Institute of Physics)'
|
||||
category = 'physics, journal, science'
|
||||
language = 'en'
|
||||
|
||||
oldest_article = 30
|
||||
max_articles_per_feed = 100
|
||||
|
||||
keep_only_tags = [dict(id=['fulltextContainer'])]
|
||||
no_stylesheets=True
|
||||
use_embedded_content=False
|
||||
|
||||
|
||||
feeds = [(u'Latest Papers', u'http://iopscience.iop.org/1367-2630/?rss=1')]
|
||||
|
||||
cover_url = 'http://images.iop.org/journals_icons/Info/1367-2630/cover.gif'
|
||||
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 30
|
||||
timeout = 30
|
||||
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
remove_javascript = True
|
||||
remove_empty_feeds = True
|
||||
asciiize = True # Converts all non-ASCII characters to their ASCII equivalents
|
||||
|
||||
keep_only_tags = [
|
||||
dict(id=['articleEvoContainer']),
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class':'affiliations'}), # Removes Show Affiliations
|
||||
dict(name='div', attrs={'class':'abst-icon-links'}), # Removes Tags and PDF export
|
||||
dict(name='p', attrs={'class':'studyimage'}), # remove study image
|
||||
dict(name='a', attrs={'class':'icon powerpoint'}), # remove Export to PowerPoint Slide
|
||||
dict(name='a', attrs={'title':'CrossRef'}), # remove CrossRef icon
|
||||
dict(name='a', attrs={'title':'PubMed'}), # remove PubMed icon
|
||||
dict(name='a', attrs={'e4f5426941':'true'}), # remove cross ref image
|
||||
dict(name='img', attrs={'src':''}), # remove empty image
|
||||
dict(name='a', attrs={'class':'closeChap'}), # remove 'Close'
|
||||
dict(name='ul', attrs={'class':'breadcrumbs'}), # remove Top breadcrumbs
|
||||
]
|
||||
|
||||
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
|
||||
.introduction, .first { font-weight: bold; } \
|
||||
.cross-head { font-weight: bold; font-size: 125%; } \
|
||||
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
|
||||
.cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
|
||||
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
|
||||
.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
|
||||
font-size: 80%; font-style: italic; margin: 1px auto; } \
|
||||
.story-date, .published { font-size: 80%; } \
|
||||
table { width: 100%; } \
|
||||
td img { display: block; margin: 5px auto; } \
|
||||
ul { padding-top: 10px; } \
|
||||
ol { padding-top: 10px; } \
|
||||
li { padding-top: 5px; padding-bottom: 5px; } \
|
||||
h1 { font-size: 175%; font-weight: bold; } \
|
||||
h2 { font-size: 150%; font-weight: bold; } \
|
||||
h3 { font-size: 125%; font-weight: bold; } \
|
||||
h4, h5, h6 { font-size: 100%; font-weight: bold; }'
|
||||
|
||||
# Remove the line breaks.
|
||||
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
|
||||
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
return url+"/fulltext"
|
||||
return url+"/article"
|
54
recipes/nol.recipe
Normal file
54
recipes/nol.recipe
Normal file
@ -0,0 +1,54 @@
|
||||
################################################################################
|
||||
#Description: http://nol.hu/ RSS channel
|
||||
#Author: Bigpapa (bigpapabig@hotmail.com)
|
||||
#Date: 2011.12.18. - V1.1
|
||||
################################################################################
|
||||
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class NOL(BasicNewsRecipe):
    """Calibre recipe for the nol.hu RSS channels (Hungarian)."""

    title = u'NOL'
    __author__ = 'Bigpapa'
    language = 'hu'
    publication_type = 'newsportal'
    encoding = 'utf8'

    oldest_article = 5
    # Maximum number of articles stored per feed in the e-book.
    max_articles_per_feed = 5
    no_stylesheets = True
    # delay = 1
    use_embedded_content = False

    conversion_options = {'linearize_tables': True}

    # The article lives in the table with class "article-box".
    keep_only_tags = [
        {'name': 'table', 'attrs': {'class': ['article-box']}},
    ]

    remove_tags = [
        {'name': 'div', 'attrs': {'class': [
            'h', 'ad-container-outer', 'tags noborder', 'ad-container-inner',
            'image-container-lead', 'tags', 'related-container']}},
        {'name': 'h4'},
        {'name': 'tfoot'},
        {'name': 'td', 'attrs': {'class': ['foot']}},
        {'name': 'span', 'attrs': {'class': ['image-container-caption']}},
    ]

    feeds = [
        # Disabled feed:
        # (u'V\xe1logat\xe1s', 'http://nol.hu/feed/valogatas.rss'),
        (u'Belf\xf6ld', 'http://nol.hu/feed/belfold.rss'),
        (u'K\xfclf\xf6ld', 'http://nol.hu/feed/kulfold.rss'),
        (u'Gazdas\xe1g', 'http://nol.hu/feed/gazdasag.rss'),
        (u'V\xe9lem\xe9ny', 'http://nol.hu/feed/velemeny.rss'),
        (u'Kult\xfara', 'http://nol.hu/feed/kult.rss'),
        (u'Tud/Tech', 'http://nol.hu/feed/tud-tech.rss'),
        (u'Sport', 'http://nol.hu/feed/sport.rss'),
        (u'Noller', 'http://nol.hu/feed/noller.rss'),
        (u'Mozaik', 'http://nol.hu/feed/mozaik.rss'),
        (u'Utaz\xe1s', 'http://nol.hu/feed/utazas.rss'),
        (u'Aut\xf3', 'http://nol.hu/feed/auto.rss'),
        (u'Voks', 'http://nol.hu/feed/voks.rss'),
    ]
|
100
recipes/novilist_novine_hr.recipe
Normal file
100
recipes/novilist_novine_hr.recipe
Normal file
@ -0,0 +1,100 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
novine.novilist.hr
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre import strftime
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class NoviList_hr(BasicNewsRecipe):
    """Calibre recipe for the subscription edition of Novi List
    (novine.novilist.hr).

    The issue is assembled by ``parse_index`` from the table of contents on
    the index page rather than from RSS feeds.
    """

    title = 'Novi List'
    __author__ = 'Darko Miletic'
    description = 'Vijesti iz Hrvatske'
    publisher = 'NOVI LIST d.d.'
    category = 'Novi list, politika, hrvatski dnevnik, Novine, Hrvatska, Croatia, News, newspaper, Hrvatski,Primorje, dnevni list, Rijeka'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1250'
    use_embedded_content = False
    language = 'hr'
    remove_empty_feeds = True
    publication_type = 'newspaper'
    needs_subscription = True
    masthead_url = 'http://novine.novilist.hr/images/system/novilist-logo.jpg'
    # Base URL; every relative link found on the site is appended to it.
    index = 'http://novine.novilist.hr/'
    extra_css = """
        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
        body{font-family: Geneva,Arial,Helvetica,Swiss,sans1,sans-serif }
        img{display:block; margin-bottom: 0.4em; margin-top: 0.4em}
        .nadnaslov,.podnaslov{font-size: small; display: block; margin-bottom: 1em}
        .naslov{font-size: x-large; color: maroon; font-weight: bold; display: block; margin-bottom: 1em;}
        p{display: block}
        """

    # Replace U+0110 with U+00D0 in the downloaded markup.
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
        'linearize_tables': True,
    }

    keep_only_tags = [
        dict(name='td', attrs={'class': ['nadnaslov', 'naslov', 'podnaslov']}),
        dict(name='font', attrs={'face': 'Geneva,Arial,Helvetica,Swiss'}),
    ]

    remove_tags = [dict(name=['meta', 'link', 'iframe', 'embed', 'object'])]
    remove_attributes = ['border', 'lang', 'size', 'face', 'bgcolor']

    def get_browser(self):
        """Return a browser, logged in when credentials are available."""
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open(self.index + 'loginnow.asp')
            br.select_form(nr=0)
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Build the list of ``(section title, articles)`` for the issue.

        Also sets ``self.cover_url`` from the index page. In test mode only
        the first two sections are processed. Anchors without an ``href``
        and section cells without a usable link are skipped instead of
        raising.
        """
        articles = []
        count = 0
        soup = self.index_to_soup(self.index)

        # Cover: an anchor pointing into images/clanci/DOC_*; when several
        # match, the last one wins.
        for alink in soup.findAll('a'):
            href = alink.get('href') or ''
            if href.startswith('images/clanci/DOC_'):
                self.cover_url = self.index + href

        # Sections: one table cell of class "tocrubrika" per section.
        for item in soup.findAll('td', attrs={'class': 'tocrubrika'}):
            count = count + 1
            if self.test and count > 2:
                return articles
            aitem = item.a
            section_href = aitem.get('href') if aitem is not None else None
            if not section_href:
                # No link to follow for this cell.
                continue
            section = self.tag_to_string(aitem)
            feedpage = self.index_to_soup(self.index + section_href)
            self.report_progress(0, _('Fetching feed')+' %s...'%(section))

            inarts = []
            for alink in feedpage.findAll('a', attrs={'class': 'naslovlinkdesno'}):
                article_href = alink.get('href')
                if not article_href:
                    continue
                inarts.append({
                    'title': self.tag_to_string(alink),
                    'date': strftime(self.timefmt),
                    'url': self.index + article_href,
                    'description': '',
                })

            # Empty sections are kept only when remove_empty_feeds is off.
            if inarts or not self.remove_empty_feeds:
                articles.append((section, inarts))
        return articles

    def print_version(self, url):
        """Return *url* with the 'Rubrike' WCI parameter swapped for 'Pretrazivac'."""
        return url.replace('?WCI=Rubrike&', '?WCI=Pretrazivac&')
|
49
recipes/novilist_portal_hr.recipe
Normal file
49
recipes/novilist_portal_hr.recipe
Normal file
@ -0,0 +1,49 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
www.novilist.hr
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class NoviList_Portal_hr(BasicNewsRecipe):
    """Calibre recipe for the Novi List online portal (www.novilist.hr)."""

    # Metadata.
    title = 'Novi List - online portal'
    __author__ = 'Darko Miletic'
    description = 'Portal Novog Lista'
    publisher = 'NOVI LIST d.d.'
    category = 'Novi list, politika, hrvatski dnevnik, Novine, Hrvatska, Croatia, News, newspaper, Hrvatski,Primorje, dnevni list, Rijeka'
    language = 'hr'
    publication_type = 'newsportal'
    masthead_url = 'http://www.novilist.hr/design/novilist/images/logo-print.gif'

    # Fetch settings.
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False

    extra_css = """
        body{font-family: Geneva,Arial,Helvetica,Swiss,sans-serif }
        h1{font-family: Georgia,serif}
        img{display:block; margin-bottom: 0.4em; margin-top: 0.4em}
        """

    # Replace U+0110 with U+00D0 in the downloaded markup.
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
        'linearize_tables': True,
    }

    # Keep the element with id "content"; strip embedded/metadata tags.
    keep_only_tags = [{'name': 'div', 'attrs': {'id': 'content'}}]
    remove_tags = [{'name': ['meta', 'link', 'iframe', 'embed', 'object']}]
    remove_attributes = ['border', 'lang']

    feeds = [(u'Vijesti', u'http://www.novilist.hr/rss/feed/sve.xml')]

    def print_version(self, url):
        """Return *url* re-rooted under the site's /layout/set/print/ path."""
        site_root = 'http://www.novilist.hr/'
        print_root = 'http://www.novilist.hr/layout/set/print/'
        return url.replace(site_root, print_root)
|
@ -325,7 +325,8 @@ class NYTimes(BasicNewsRecipe):
|
||||
'''
|
||||
def get_the_soup(docEncoding, url_or_raw, raw=False) :
|
||||
if re.match(r'\w+://', url_or_raw):
|
||||
f = self.browser.open(url_or_raw)
|
||||
br = self.clone_browser(self.browser)
|
||||
f = br.open_novisit(url_or_raw)
|
||||
_raw = f.read()
|
||||
f.close()
|
||||
if not _raw:
|
||||
|
@ -364,7 +364,8 @@ class NYTimes(BasicNewsRecipe):
|
||||
'''
|
||||
def get_the_soup(docEncoding, url_or_raw, raw=False) :
|
||||
if re.match(r'\w+://', url_or_raw):
|
||||
f = self.browser.open(url_or_raw)
|
||||
br = self.clone_browser(self.browser)
|
||||
f = br.open_novisit(url_or_raw)
|
||||
_raw = f.read()
|
||||
f.close()
|
||||
if not _raw:
|
||||
|
77
recipes/opinion_bo.recipe
Normal file
77
recipes/opinion_bo.recipe
Normal file
@ -0,0 +1,77 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Piet van Oostrum <piet@vanoostrum.org>'
|
||||
'''
|
||||
www.opinion.com.bo
|
||||
'''
|
||||
|
||||
from calibre import strftime
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Opinion_Bol(BasicNewsRecipe):
    # Recipe for the Bolivian daily "Opinión" (www.opinion.com.bo). Articles
    # come from the per-section RSS feeds; only links carrying today's date
    # path are kept (see get_article_url).

    # -- identity / metadata --
    title = u'Opinión - Bolivia'
    __author__ = 'Piet van Oostrum'
    description = u'Opinión diario de circulación nacional, Cochabamba, Bolivia'
    publisher = 'Coboce Ltda - Editora Opinión'
    category = 'news, politics, Bolivia'
    version = 1
    language = 'es_BO'
    publication_type = 'newspaper'

    # -- download behaviour --
    oldest_article = 1
    max_articles_per_feed = 20
    delay = 1
    encoding = 'utf-8'
    use_embedded_content = False
    no_stylesheets = True
    remove_empty_feeds = True

    # Both values are formatted once, when the class body is executed.
    cover_url = strftime('http://www.opinion.com.bo/opinion/articulos/%Y/%m%d/fotos/portada_650.jpg')
    # Date path fragment used to recognise today's articles.
    # NOTE(review): the original author remarks that the timezone should
    # maybe be taken into account here.
    today = strftime('/%Y/%m%d/')

    masthead_url = 'http://opinion.com.bo/opinion/articulos/imagenes/logo_opinion.gif'
    extra_css = """body{font-family: Helvetica,Arial,sans-serif}
                   .seccion_encabezado_nota_inte{font-size: 1.1em;
                   font-weight: bold;}
                   .autor_nota_inte{color: #999999; font-size: 0.8em;
                   margin-bottom: 0.5em; text-align: right;}
                   .pie{font-size: 0.8em;}"""

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # -- clean-up rules --
    keep_only_tags = [dict(name='div', attrs={'class': 'columna_izq_nota_intererior'})]
    remove_tags = [
        dict(name=['meta', 'link', 'form', 'iframe', 'embed', 'object', 'style']),
        dict(name='div', attrs={'class': 'ocultar'}),
    ]
    remove_attributes = ['width', 'height']

    feeds = [
        (u'El País', u'http://www.opinion.com.bo/opinion/rss/el_pais_rss.xml'),
        (u'Cochabamba', u'http://www.opinion.com.bo/opinion/rss/cochabamba_rss.xml'),
        (u'Economía', u'http://www.opinion.com.bo/opinion/rss/economia_rss.xml'),
        (u'Cultura', u'http://www.opinion.com.bo/opinion/rss/cultura_rss.xml'),
        (u'Mundo', u'http://www.opinion.com.bo/opinion/rss/mundo_rss.xml'),
        (u'Ciencia y Tecnología', u'http://www.opinion.com.bo/opinion/rss/ciencia_tecnologia_rss.xml'),
        (u'Policial', u'http://www.opinion.com.bo/opinion/rss/policial_rss.xml'),
        (u'Editorial', u'http://www.opinion.com.bo/opinion/rss/editorial_rss.xml'),
        (u'Subeditorial', u'http://www.opinion.com.bo/opinion/rss/subeditorial_rss.xml'),
        (u'Opinión', u'http://www.opinion.com.bo/opinion/rss/opinion_rss.xml'),
        (u'Deportes', u'http://www.opinion.com.bo/opinion/rss/deportes_rss.xml'),
        (u' Vida de hoy', u'http://www.opinion.com.bo/opinion/rss/vidadehoy_rss.xml'),
    ]

    def preprocess_html(self, soup):
        # Strip every inline style attribute, then hand the same soup back.
        for styled in soup.findAll(style=True):
            del styled['style']
        return soup

    def get_article_url(self, article):
        # Keep only today's articles: the link is returned when it contains
        # today's date path, otherwise None is returned.
        # NOTE(review): presumably the feed machinery drops entries whose
        # URL is None - confirm against BasicNewsRecipe.
        candidate = article.link
        if self.today not in candidate:
            return None
        return candidate
|
196
recipes/oreilly_premium.recipe
Normal file
196
recipes/oreilly_premium.recipe
Normal file
@ -0,0 +1,196 @@
|
||||
# Talking Points is not grabbing everything.
|
||||
# The look is right, but only the last one added?
|
||||
import re
|
||||
import time
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
# Allows the Python soup converter, which makes parsing easier.
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
# strip ads and graphics
|
||||
# Current Column lacks a title.
|
||||
# Talking Points Memo - shorten title - Remove year and Bill's name
|
||||
# The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
|
||||
# Newsletters: Talking Points Memos covered by cat12
|
||||
|
||||
class OReillyPremium(BasicNewsRecipe):
    # Recipe for the subscriber-only content of BillOReilly.com. The index is
    # built by scraping six category pages (see catList) instead of reading
    # RSS feeds.

    title = u'OReilly Premium'
    __author__ = 'TMcN'
    description = 'Retrieves Premium and News Letter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
    cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
    auto_cleanup = True
    encoding = 'utf8'
    needs_subscription = True
    no_stylesheets = True
    oldest_article = 20
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]
    # Don't follow links found inside the articles.
    recursions = 0
    max_articles_per_feed = 2000

    # When true, progress information is printed while the index is parsed.
    debugMessages = True

    # One entry per category:
    #   [name, index URL, tag name for find/findAll, attrs filter, articleList]
    # Positions 0-2 are listings that yield many articles; 3-5 yield one page
    # (Stratfor, Current Column) or need special table walking (Talking Points).
    catList = [
        ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
        ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
        ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
        ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
        ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
        ["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []],
    ]

    def get_browser(self):
        # Return a browser that has submitted the member sign-in form when
        # credentials are configured.
        # NOTE(review): get_browser is called on the class without an
        # instance - presumably it is a classmethod in this calibre version;
        # confirm before changing.
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
            br.select_form(name='login')
            br['formEmailField'] = self.username
            br['formPasswordField'] = self.password
            br.submit()
        return br

    def extractPrintURL(self, baseURL, pageURL, printString):
        # Return the best-guess print URL for pageURL: baseURL plus the href
        # of the element whose text equals printString. pageURL itself is
        # returned whenever no such link can be found.
        soup = self.index_to_soup(pageURL)
        if not soup:
            # The original fell through to an unbound 'printText' here.
            print("Failed to find Print string "+printString+ " in "+pageURL)
            return pageURL
        printText = soup.find('a', text=printString)
        if not printText:
            return pageURL
        href = printText.parent.get('href')
        if not href:
            return pageURL
        return baseURL+href

    def stripBadChars(self, inString) :
        # Remove every apostrophe from inString.
        return inString.replace("'", "")

    def _singleArticle(self, name, soup, title, url):
        # Build the one-element article list used by the single-page
        # categories. The description is taken from a 'summary' element of
        # this category's own page when there is one (the original looked in
        # a stale tag left over from an earlier category's loop).
        if self.debugMessages :
            print(name+" Title:"+title+" at url: "+url)
        description = 'None'
        summary = soup.find(True, attrs={'class':'summary'})
        if summary:
            description = self.tag_to_string(summary, use_alt=False)
        pubdate = time.strftime('%a, %d %b')
        return [dict(title=title, url=url, date=pubdate, description=description, content='')]

    def _parseListing(self, i, category, soup, baseURL):
        # Categories 0-2: every tag matching the category filter is one
        # article. Returns the (possibly empty) list of article dicts.
        articleList = []
        pubdate = time.strftime('%a, %d %b')
        for div in soup.findAll(category[2], category[3]):
            # Category 1 matches a container holding the link; the other
            # listings match the linking tag itself.
            if i == 1:
                a = div.find('a', href=True)
            else :
                a = div
            if self.debugMessages :
                print(div)
                print(a)
            if not a:
                continue
            href = a.get('href')
            if not href:
                # The filter for category 2 accepts any tag name, so a match
                # is not guaranteed to carry a link.
                continue
            # Reset per article so a summary never leaks into the next entry.
            description = 'None'
            summary = div.find(True, attrs={'class':'summary'})
            if summary:
                description = self.tag_to_string(summary, use_alt=False)
            url = self.extractPrintURL(baseURL, baseURL+href, "Print this entry")
            if i < 2 :
                title = self.tag_to_string(a, use_alt=True).strip()
            else :
                # Daily Briefings: the title is the first child of the tag.
                title = div.contents[0]
            if self.debugMessages :
                print(title+" @ "+url)
            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        return articleList

    def _parseStratfor(self, category, soup, baseURL):
        # Category 3: a single linked page. Returns None when the link is
        # missing so the caller skips the category.
        a = soup.find('a', category[3])
        if a is None :
            return None
        url = baseURL+a['href']
        # Link text is the fallback title; the real one comes from the page.
        title = self.tag_to_string(a, use_alt=True).strip()
        stratSoup = self.index_to_soup(url)
        titleTag = stratSoup.find('title')
        if titleTag is not None and titleTag.string:
            title = titleTag.string
            stratIndex = title.find('Stratfor.com:', 0)
            if stratIndex > -1 :
                # Skips the marker plus one following character and drops
                # the final character, exactly as the original did.
                title = title[stratIndex+14:-1]
        return self._singleArticle(category[0], soup, title, url)

    def _parseTalkingPoints(self, soup, baseURL):
        # Category 4: the articles are rows of a table three table-levels
        # above the first 'blogBody' cell. Returns None when that structure
        # cannot be located so the caller skips the category.
        topDate = soup.find("td", "blogBody")
        if not topDate :
            print("Failed to find date in Talking Points")
            return None
        upTwo = topDate
        for _level in range(3):
            # findParent yields None instead of raising when nothing matches.
            upTwo = upTwo.findParent('table')
            if upTwo is None:
                return None
        if self.debugMessages :
            print("Entering rows")
        articleList = []
        for rows in upTwo.findChildren("tr", recursive=False):
            # Inside the top level table, each row is an article.
            articleTable = rows.find("tr")
            if articleTable is None:
                continue
            dateLink = articleTable.find("a","blogDate")
            bodyCells = articleTable.findAll("td", "blogBody")
            readMore = articleTable.find("a", "homeBlogReadMore bold")
            if dateLink is None or len(bodyCells) < 2 or readMore is None:
                # Not an article row (spacer, header, changed layout).
                continue
            blogDate = dateLink.contents[0]
            # The title lives in the second blogBody cell.
            blogTitle = bodyCells[1].contents[0]
            url = baseURL+re.sub(r'\?.*', '', readMore['href'])
            title = blogDate+": "+self.stripBadChars(blogTitle.replace("Bill O'Reilly: ", ""))
            if self.debugMessages :
                print("Talking Points Memo title "+title+" at url: "+url)
            pubdate = time.strftime('%a, %d %b')
            articleList.append(dict(title=title, url=url, date=pubdate, description='None', content=''))
        return articleList

    def _parseCurrentColumn(self, category, soup, baseURL):
        # Category 5: the category page is the article itself. Returns None
        # when the title element is missing.
        titleSpan = soup.find(category[2], category[3])
        if titleSpan is None :
            return None
        title = titleSpan.contents[0]
        url = self.extractPrintURL(baseURL, category[1], "Print This Article")
        return self._singleArticle(category[0], soup, title, url)

    def parseGeneric(self, baseURL):
        # Scrape every category in catList and return a list of
        # (category name, list of article dicts) tuples. A category whose
        # page lacks the expected structure is left out of the result.
        fullReturn = []
        for i, category in enumerate(self.catList):
            soup = self.index_to_soup(category[1])
            if i < 3 :
                articleList = self._parseListing(i, category, soup, baseURL)
            elif i == 3 :
                articleList = self._parseStratfor(category, soup, baseURL)
            elif i == 4 :
                articleList = self._parseTalkingPoints(soup, baseURL)
            else :
                articleList = self._parseCurrentColumn(category, soup, baseURL)
            if articleList is None:
                continue
            # Slot 4 is the articleList slot; the original stored into slot 3
            # and thereby overwrote the attrs filter of the category.
            category[4] = articleList
            fullReturn.append((category[0], articleList))
        return fullReturn

    # parse_index() is used instead of feed parsing. It returns a list of
    # tuples ('feed title', list of articles), each article being a dict:
    # {
    # 'title'       : article title,
    # 'url'         : URL of print version,
    # 'date'        : The publication date of the article as a string,
    # 'description' : A summary of the article
    # 'content'     : The full article (can be an empty string).
    # }
    def parse_index(self):
        baseURL = "https://www.billoreilly.com"
        return self.parseGeneric(baseURL)

    def preprocess_html(self, soup):
        # Follow a <meta http-equiv="refresh"> redirect by hand: fetch the
        # target (site-relative) and return its soup. Pages without a usable
        # redirect target are returned untouched.
        refresh = soup.find('meta', {'http-equiv':'refresh'})
        if refresh is None:
            return soup
        content = (refresh.get('content') or '').partition('=')[2]
        if not content:
            return soup
        raw = self.browser.open('https://www.billoreilly.com'+content).read()
        return BeautifulSoup(raw.decode('cp1252', 'replace'))
|
||||
|
@ -1,12 +1,10 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
pagina12.com.ar
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
|
||||
class Pagina12(BasicNewsRecipe):
|
||||
title = 'Pagina - 12'
|
||||
@ -66,9 +64,7 @@ class Pagina12(BasicNewsRecipe):
|
||||
return url.replace('http://www.pagina12.com.ar/','http://www.pagina12.com.ar/imprimir/')
|
||||
|
||||
def get_cover_url(self):
|
||||
rawc = self.index_to_soup('http://www.pagina12.com.ar/diario/principal/diario/index.html',True)
|
||||
rawc2 = re.sub(r'PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN','PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"',rawc)
|
||||
soup = BeautifulSoup(rawc2,fromEncoding=self.encoding,smartQuotesTo=None)
|
||||
soup = self.index_to_soup('http://www.pagina12.com.ar/diario/principal/diario/index.html')
|
||||
for image in soup.findAll('img',alt=True):
|
||||
if image['alt'].startswith('Tapa de la fecha'):
|
||||
return image['src']
|
||||
|
14
recipes/pambianco.recipe
Normal file
14
recipes/pambianco.recipe
Normal file
@ -0,0 +1,14 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1326135591(BasicNewsRecipe):
    # Feed-only recipe for Pambianco; article clean-up is left to calibre's
    # automatic cleanup.

    # -- authorship --
    __author__ = 'faber1971'
    __version__ = 'v1.0'
    __date__ = '9, January 2011'

    # -- publication --
    title = u'Pambianco'
    description = 'fashion magazine for professional people'
    language = 'it'

    # -- download behaviour --
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    feeds = [
        (u'Pambianco', u'http://feeds.feedburner.com/pambianconews/YGXu'),
    ]
|
@ -33,3 +33,6 @@ class BasicUserRecipe1314970845(BasicNewsRecipe):
|
||||
(u'Obituaries', u'http://www.philly.com/inquirer_obituaries.rss')
|
||||
]
|
||||
|
||||
def print_version(self, url):
    # Append the '?viewAll=y' query string to the article URL.
    view_all_query = '?viewAll=y'
    return url + view_all_query
|
||||
|
||||
|
79
recipes/prospectmaguk.recipe
Normal file
79
recipes/prospectmaguk.recipe
Normal file
@ -0,0 +1,79 @@
|
||||
#!/usr/bin/env python
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
|
||||
'''
|
||||
calibre recipe for prospectmagazine.co.uk (subscription)
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class ProspectMagUK(BasicNewsRecipe):
    # Recipe for Prospect Magazine (subscription). The current-issue page is
    # scraped for the cover image and for the articles of each section.

    title = u'Prospect Magazine'
    __author__ = 'barty, duluoz'
    description = 'A general-interest publication offering analysis and commentary about politics, news and business.'
    category = 'news, UK'
    language = 'en_GB'
    publication_type = 'magazine'
    masthead_url = 'http://www.prospectmagazine.co.uk/wp-content/themes/prospect/images/titleMain.jpg'
    timefmt = ' [%d %B %Y]'

    needs_subscription = True
    max_articles_per_feed = 100
    no_stylesheets = True

    auto_cleanup = True
    auto_cleanup_keep = '//div[@class="lead_image"]'
    remove_tags = [{'class': ['shareinpost', 'postutils', 'postinfo']}]

    INDEX = 'http://www.prospectmagazine.co.uk/current-issue'

    def get_browser(self):
        # Log in through the WordPress login form when credentials are set.
        # NOTE(review): get_browser is called on the class without an
        # instance - presumably a classmethod in this calibre version.
        br = BasicNewsRecipe.get_browser()
        have_credentials = self.username is not None and self.password is not None
        if have_credentials:
            br.open('http://www.prospectmagazine.co.uk/wp-login.php')
            br.select_form(name='loginform')
            br['log'] = self.username
            br['pwd'] = self.password
            br.submit()
        return br

    def _find_cover(self, soup):
        # Return the cover image URL from the index page (site-relative
        # paths are made absolute), or None when there is no cover image.
        holder = soup.find('div', id='cover_image')
        if holder is None:
            return None
        img = holder.find('img', src=True)
        if img is None:
            return None
        src = img['src']
        if src.startswith('/'):
            src = 'http://www.prospectmagazine.co.uk' + src
        return src

    def _article_from(self, item):
        # Turn one 'post' div into an article dict; None when it has no link.
        link = item.find('a', href=True)
        if link is None:
            return None
        url = link['href']
        title = self.tag_to_string(link)
        blurb = item.find('p')
        if blurb is None:
            desc = ''
        else:
            desc = self.tag_to_string(blurb)
        art = {'title': title, 'description': desc, 'date': ' ', 'url': url}
        self.log('\tFound article:', title, '::', url)
        byline = item.find(attrs={'class': re.compile('author')})
        if byline is not None:
            art['author'] = self.tag_to_string(byline).strip()
        return art

    def parse_index(self):
        # Return [(section name, [article dict, ...]), ...] for the current
        # issue, setting self.cover_url on the way when a cover is found.
        soup = self.index_to_soup(self.INDEX)

        cover = self._find_cover(soup)
        if cover is not None:
            self.cover_url = cover

        feeds = []
        for sect in soup.findAll('div', attrs={'class': 'sectionheading'}):
            fname = self.tag_to_string(sect).replace('>', '').strip()
            self.log('Found section', fname)
            articles = []

            # Walk the following sibling divs and stop at the first one that
            # is not a post: simply collecting every 'post' sibling would
            # also grab the articles of the sections that follow.
            for item in sect.findNextSiblings('div', attrs={'class': True}):
                if 'post' not in item['class']:
                    break
                art = self._article_from(item)
                if art is not None:
                    articles.append(art)

            feeds.append((fname, articles))
        return feeds
|
65
recipes/rionegro.recipe
Normal file
65
recipes/rionegro.recipe
Normal file
@ -0,0 +1,65 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
www.rionegro.com.ar
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class RioNegro(BasicNewsRecipe):
    # Recipe for Diario Rio Negro (www.rionegro.com.ar): per-category RSS
    # feeds, articles fetched through the site's print page.

    # -- identity / metadata --
    title = 'Diario Rio Negro'
    __author__ = 'Darko Miletic'
    description = 'Noticias desde la Patagonia Argentina y el resto del mundo'
    publisher = 'Editorial Rio Negro SA.'
    category = 'news, politics, Argentina'
    language = 'es_AR'
    publication_type = 'newspaper'
    masthead_url = 'http://www.rionegro.com.ar/diario/imagenes/logorn.gif'

    # -- download behaviour --
    oldest_article = 2
    max_articles_per_feed = 200
    encoding = 'utf8'
    use_embedded_content = False
    no_stylesheets = True
    remove_empty_feeds = True

    extra_css = """
                    body{font-family: Arial,Helvetica,sans-serif }
                    img{display:block}
                    h1 {font-size: 0.89em; color: red}
                    h2 {font-family: Georgia,"Times New Roman",Times,serif; font-size: 1.8em}
                    h3 {font-family: Georgia,"Times New Roman",Times,serif; border-bottom: 2px solid gray}
                """

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # -- clean-up rules --
    keep_only_tags = [dict(attrs={'class': 'nota'})]
    remove_tags = [
        dict(name=['meta', 'link', 'iframe', 'object', 'embed']),
        dict(name='div', attrs={'class': 'logo'}),
    ]
    remove_attributes = ['lang']

    # Every feed is served by the same endpoint and differs only in idcat.
    feeds = [
        (u'Argentina', u'http://www.rionegro.com.ar/diario/funciones/xml/rss.aspx?idcat=9532'),
        (u'El Mundo', u'http://www.rionegro.com.ar/diario/funciones/xml/rss.aspx?idcat=9533'),
        (u'Carta de lectores', u'http://www.rionegro.com.ar/diario/funciones/xml/rss.aspx?idcat=9538'),
        (u'Columnistas', u'http://www.rionegro.com.ar/diario/funciones/xml/rss.aspx?idcat=9539'),
        (u'Domingo a Domingo', u'http://www.rionegro.com.ar/diario/funciones/xml/rss.aspx?idcat=9541'),
        (u'Editorial', u'http://www.rionegro.com.ar/diario/funciones/xml/rss.aspx?idcat=9542'),
        (u'Deportes', u'http://www.rionegro.com.ar/diario/funciones/xml/rss.aspx?idcat=9522'),
        (u'Espectaculos', u'http://www.rionegro.com.ar/diario/funciones/xml/rss.aspx?idcat=9523'),
        (u'Sociedad', u'http://www.rionegro.com.ar/diario/funciones/xml/rss.aspx?idcat=9574'),
        (u'Policiales', u'http://www.rionegro.com.ar/diario/funciones/xml/rss.aspx?idcat=9525'),
        (u'Municipales', u'http://www.rionegro.com.ar/diario/funciones/xml/rss.aspx?idcat=9862'),
        (u'Region', u'http://www.rionegro.com.ar/diario/funciones/xml/rss.aspx?idcat=9701'),
    ]

    def print_version(self, url):
        # Build the print-page URL from the article id in the link: take the
        # text after the last 'idart=' and keep the part of it that precedes
        # its last '&' (an empty string when no '&' follows the id).
        after_marker = url.rpartition('idart=')[2]
        article_id, _sep, _rest = after_marker.rpartition('&')
        return ('http://www.rionegro.com.ar/diario/rn/print.aspx?idArt='
                + article_id + '&tipo=2')
|
17
recipes/salonica_press_news.recipe
Normal file
17
recipes/salonica_press_news.recipe
Normal file
@ -0,0 +1,17 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class spn(BasicNewsRecipe):
    # Feed-only recipe for Salonica Press News (spnews.gr); article clean-up
    # is left to calibre's automatic cleanup.

    title = u'Salonica Press News'
    __author__ = "SteliosGero"
    category = 'news, GR'
    # The original also assigned language = 'gr' earlier in the class body;
    # that value was immediately overridden by this one and has been removed.
    language = 'el'

    oldest_article = 3
    max_articles_per_feed = 100
    auto_cleanup = True

    # (section title, RSS URL) pairs, one per line; titles are escaped Greek.
    feeds = [
        (u'\u03a0\u03bf\u03bb\u03b9\u03c4\u03b9\u03ba\u03ae', u'http://www.spnews.gr/politiki?format=feed&type=rss'),
        (u'\u039f\u03b9\u03ba\u03bf\u03bd\u03bf\u03bc\u03af\u03b1', u'http://www.spnews.gr/oikonomia?format=feed&type=rss'),
        (u'\u0391\u03c5\u03c4\u03bf\u03b4\u03b9\u03bf\u03af\u03ba\u03b7\u03c3\u03b7', u'http://www.spnews.gr/aftodioikisi?format=feed&type=rss'),
        (u'\u039a\u03bf\u03b9\u03bd\u03c9\u03bd\u03af\u03b1', u'http://www.spnews.gr/koinonia?format=feed&type=rss'),
        (u'\u0391\u03b8\u03bb\u03b7\u03c4\u03b9\u03c3\u03bc\u03cc\u03c2', u'http://www.spnews.gr/sports?format=feed&type=rss'),
        (u'\u0394\u03b9\u03b5\u03b8\u03bd\u03ae', u'http://www.spnews.gr/diethni?format=feed&type=rss'),
        (u'\u03a0\u03bf\u03bb\u03b9\u03c4\u03b9\u03c3\u03bc\u03cc\u03c2', u'http://www.spnews.gr/politismos?format=feed&type=rss'),
        (u'Media', u'http://www.spnews.gr/media-news?format=feed&type=rss'),
        (u'\u0396\u03c9\u03ae', u'http://www.spnews.gr/zoi?format=feed&type=rss'),
        (u'\u03a4\u03b5\u03c7\u03bd\u03bf\u03bb\u03bf\u03b3\u03af\u03b1', u'http://spnews.gr/texnologia?format=feed&type=rss'),
        (u'\u03a0\u03b5\u03c1\u03b9\u03b2\u03ac\u03bb\u03bb\u03bf\u03bd', u'http://spnews.gr/periballon?format=feed&type=rss'),
        (u'\u03a0\u03b1\u03c1\u03b1\u03c0\u03bf\u03bb\u03b9\u03c4\u03b9\u03ba\u03ac', u'http://spnews.gr/parapolitika?format=feed&type=rss'),
        (u'\u03a0\u03b1\u03c1\u03b1\u03b4\u03b7\u03bc\u03bf\u03c4\u03b9\u03ba\u03ac', u'http://spnews.gr/paradimotika?format=feed&type=rss'),
        (u'\u03a0\u03b1\u03c1\u03b1\u03b1\u03b8\u03bb\u03b7\u03c4\u03b9\u03ba\u03ac', u'http://spnews.gr/parathlitika?format=feed&type=rss'),
        (u'\u0391\u03c0\u03cc\u03c8\u03b5\u03b9\u03c2', u'http://spnews.gr/apopseis?format=feed&type=rss'),
        (u'\u03a3\u03c5\u03bd\u03b5\u03cd\u03be\u03b5\u03b9\u03c2', u'http://spnews.gr/synenteykseis?format=feed&type=rss'),
        (u'Alert!', u'http://spnews.gr/alert?format=feed&type=rss'),
    ]

    def print_version(self, url):
        # Append the query string that selects the site's print view.
        return url+'?tmpl=component&print=1&layout=default&page='
|
@ -12,6 +12,7 @@ class MercuryNews(BasicNewsRecipe):
|
||||
title = 'San Jose Mercury News'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'News from San Jose'
|
||||
cover_url = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg12/lg/CA_SJMN.jpg'
|
||||
publisher = 'San Jose Mercury News'
|
||||
category = 'news, politics, USA, San Jose, California'
|
||||
oldest_article = 2
|
||||
|
@ -12,6 +12,7 @@ class SeattleTimes(BasicNewsRecipe):
|
||||
title = 'The Seattle Times'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'News from Seattle and USA'
|
||||
cover_url = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg12/lg/WA_ST.jpg'
|
||||
publisher = 'The Seattle Times'
|
||||
category = 'news, politics, USA'
|
||||
oldest_article = 2
|
||||
@ -20,6 +21,8 @@ class SeattleTimes(BasicNewsRecipe):
|
||||
use_embedded_content = False
|
||||
encoding = 'cp1252'
|
||||
language = 'en'
|
||||
auto_cleanup = True
|
||||
auto_cleanup_keep = '//div[@id="PhotoContainer"]'
|
||||
|
||||
feeds = [
|
||||
(u'Top Stories',
|
||||
@ -69,24 +72,4 @@ class SeattleTimes(BasicNewsRecipe):
|
||||
u'http://seattletimes.nwsource.com/rss/mostreadarticles.xml'),
|
||||
]
|
||||
|
||||
keep_only_tags = [dict(id='content')]
|
||||
remove_tags = [
|
||||
dict(name=['object','link','script']),
|
||||
{'class':['permission', 'note', 'bottomtools',
|
||||
'homedelivery']},
|
||||
dict(id=["rightcolumn", 'footer', 'adbottom']),
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
return url
|
||||
start_url, sep, rest_url = url.rpartition('_')
|
||||
rurl, rsep, article_id = start_url.rpartition('/')
|
||||
return u'http://seattletimes.nwsource.com/cgi-bin/PrintStory.pl?document_id=' + article_id
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
mtag = '<meta http-equiv="Content-Language" content="en-US"/>'
|
||||
soup.head.insert(0,mtag)
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
return soup
|
||||
|
||||
|
491
recipes/singtaohk.recipe
Normal file
491
recipes/singtaohk.recipe
Normal file
@ -0,0 +1,491 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Eddie Lau'
|
||||
|
||||
# data source: normal, mobile
|
||||
__Source__ = 'mobile'
|
||||
# please replace the following "True" with "False". (Default: True)
|
||||
__MakePeriodical__ = True
|
||||
# Turn below to True if your device supports display of CJK titles (Default: False)
|
||||
__UseChineseTitle__ = False
|
||||
# Set it to False if you want to skip images (Default: True)
|
||||
__KeepImages__ = True
|
||||
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
|
||||
__IncludeSummary__ = False
|
||||
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
|
||||
__IncludeThumbnails__ = True
|
||||
|
||||
|
||||
'''
|
||||
Change Log:
|
||||
2011/12/29 -- first version done
|
||||
TODO:
|
||||
* use alternative source at http://m.singtao.com/index.php
|
||||
'''
|
||||
|
||||
from calibre.utils.date import now as nowf
|
||||
import os, datetime, re
|
||||
from datetime import date
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from contextlib import nested
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.utils.localization import canonicalize_lang
|
||||
|
||||
# MAIN CLASS
|
||||
class STHKRecipe(BasicNewsRecipe):
|
||||
if __UseChineseTitle__ == True:
|
||||
title = u'\u661f\u5cf6\u65e5\u5831 (\u9999\u6e2f)'
|
||||
else:
|
||||
title = 'Sing Tao Daily - Hong Kong'
|
||||
description = 'Hong Kong Chinese Newspaper (http://singtao.com)'
|
||||
category = 'Chinese, News, Hong Kong'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:150%;}'
|
||||
masthead_url = 'http://upload.wikimedia.org/wikipedia/en/d/dd/Singtao-usa.png'
|
||||
if __Source__ == 'normal':
|
||||
keep_only_tags = [dict(name='td', attrs={'class':['bodyhead','bodytext']})]
|
||||
else:
|
||||
keep_only_tags = [dict(name='td', attrs={'class':['stmobheadline']}),
|
||||
dict(name='img', attrs={'width':['146']}),
|
||||
dict(name='td', attrs={'class':['bodytextg']}),
|
||||
]
|
||||
if __KeepImages__:
|
||||
remove_tags = [dict(name='hr')]
|
||||
else:
|
||||
remove_tags = [dict(name='hr'), dict(name='img')]
|
||||
remove_attributes = ['align']
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<font class="bodytext">', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: '<br><br><font class="bodytext">'),
|
||||
]
|
||||
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 200
|
||||
__author__ = 'Eddie Lau'
|
||||
publisher = 'Sing Tao Ltd.'
|
||||
remove_javascript = True
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
language = 'zh'
|
||||
encoding = 'Big5-HKSCS'
|
||||
recursions = 0
|
||||
conversion_options = {'linearize_tables':True}
|
||||
timefmt = ''
|
||||
auto_cleanup = False
|
||||
|
||||
def get_dtlocal(self):
|
||||
dt_utc = datetime.datetime.utcnow()
|
||||
# convert UTC to local hk time - at HKT 4.00am, all news are available
|
||||
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.0/24)
|
||||
return dt_local
|
||||
|
||||
def get_fetchdate(self):
|
||||
return self.get_dtlocal().strftime("%Y%m%d")
|
||||
|
||||
def get_fetchformatteddate(self):
|
||||
return self.get_dtlocal().strftime("%Y-%m-%d")
|
||||
|
||||
def get_fetchyear(self):
|
||||
return self.get_dtlocal().strftime("%Y")
|
||||
|
||||
def get_fetchmonth(self):
|
||||
return self.get_dtlocal().strftime("%m")
|
||||
|
||||
def get_fetchday(self):
|
||||
return self.get_dtlocal().strftime("%d")
|
||||
|
||||
def get_cover_url(self):
|
||||
#cover = 'http://singtao.com/media/a/a(2660).jpg' # for 2011/12/29
|
||||
base = 2660
|
||||
todaydate = date(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()))
|
||||
diff = todaydate - date(2011, 12, 29)
|
||||
base = base + int(diff.total_seconds()/(3600*24))
|
||||
cover = 'http://singtao.com/media/a/a(' + str(base) +').jpg'
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
try:
|
||||
br.open(cover)
|
||||
except:
|
||||
cover = 'http://singtao.com/images/stlogo.gif'
|
||||
return cover
|
||||
|
||||
def parse_index(self):
    """Return the list of ``(section title, articles)`` pairs to download.

    The module-level ``__Source__`` switch selects where sections are read
    from: 'normal' uses the singtao.com pages, anything else uses the
    m.singtao.com (mobile) pages. Sections that produce no articles are
    left out of the result.
    """
    feeds = []

    if __Source__ == 'normal':
        # single-item section
        for title, url in [(u'\u793e\u8ad6 Editorial', 'http://singtao.com/yesterday/jou/j_index.html')]:
            article = self.parse_singleitem_section(url)
            if article:
                feeds.append((title, article))

        # multiple-item sections: every anchor on the index page is a candidate
        sections = [
            (u'\u8981\u805e\u6e2f\u805e Local', 'http://singtao.com/yesterday/loc/a_index.html', '/'),
            (u'\u8ca1\u7d93 Finance', 'http://singtao.com/yesterday/fin/d_index.html', '/'),
            (u'\u5730\u7522 Properties', 'http://singtao.com/yesterday/pro/h_index.html', '/'),
            (u'\u6559\u80b2 Education', 'http://singtao.com/yesterday/edu/g_index.asp', '/'),
            (u'\u5a1b\u6a02 Entertainment', 'http://singtao.com/yesterday/ent/f_index.html', '/'),
            (u'\u9ad4\u80b2 Sports', 'http://singtao.com/yesterday/spo/c_index.html', '/'),
            (u'\u99ac\u7d93 Horse Racing', 'http://singtao.com/yesterday/rac/n_index.html', '/'),
            (u'\u526f\u520a Supplements', 'http://singtao.com/yesterday/sup/m_index.html', '/'),
            (u'\u570b\u969b World', 'http://singtao.com/yesterday/int/b_index.html', '/'),
            (u'\u4e2d\u570b China', 'http://singtao.com/yesterday/chi/e_index.html', '/'),
        ]
        for title, url, baseurl in sections:
            articles = self.parse_section_withouttext(url, baseurl)
            if articles:
                feeds.append((title, articles))
    else:  # use mobile
        # single-item section
        for title, url in [(u'\u793e\u8ad6 Editorial', 'http://m.singtao.com/showContent.php?main=paper&sub=0&title=0')]:
            article = self.parse_singleitem_section_m(url)
            if article:
                feeds.append((title, article))

        # multiple-item section
        sections = [
            (u'\u8981\u805e\u6e2f\u805e Local', 'http://m.singtao.com/showTitle.php?main=paper&sub=1', 'http://m.singtao.com/'),
            (u'\u8ca1\u7d93 Finance', 'http://m.singtao.com/showTitle.php?main=paper&sub=2', 'http://m.singtao.com/'),
            (u'\u5730\u7522 Properties', 'http://m.singtao.com/showTitle.php?main=paper&sub=3', 'http://m.singtao.com/'),
            (u'\u6559\u80b2 Education', 'http://m.singtao.com/showTitle.php?main=paper&sub=4', 'http://m.singtao.com/'),
            (u'\u5a1b\u6a02 Entertainment', 'http://m.singtao.com/showTitle.php?main=paper&sub=5', 'http://m.singtao.com/'),
            (u'\u99ac\u7d93 Horse Racing', 'http://m.singtao.com/showTitle.php?main=paper&sub=6', 'http://m.singtao.com/'),
            (u'\u9ad4\u80b2 Sports', 'http://m.singtao.com/showTitle.php?main=paper&sub=7', 'http://m.singtao.com/'),
            (u'\u526f\u520a Supplements', 'http://m.singtao.com/showTitle.php?main=paper&sub=8', 'http://m.singtao.com/'),
            (u'\u570b\u969b World', 'http://m.singtao.com/showTitle.php?main=paper&sub=9', 'http://m.singtao.com/'),
            (u'\u4e2d\u570b China', 'http://m.singtao.com/showTitle.php?main=paper&sub=10', 'http://m.singtao.com/'),
        ]
        for title, url, baseurl in sections:
            articles = self.parse_multiitem_section_m(url, baseurl)
            if articles:
                feeds.append((title, articles))
    return feeds
|
||||
|
||||
def parse_singleitem_section(self, url):
    """Wrap *url* as a one-article list; the title is left blank here."""
    entry = dict(title='', url=url, description='', date='')
    return [entry]
|
||||
|
||||
def parse_singleitem_section_m(self, url):
    """Wrap *url* as a one-article list; the title is left blank here."""
    entry = dict(title='', url=url, description='', date='')
    return [entry]
|
||||
|
||||
def parse_section(self, url):
    """Collect article links from the 436-wide tables of a section index.

    :param url: address of the section index page.
    :return: list of article dicts (``title``, ``url``, ``description``,
        ``date``). URLs are built as ``url + '/../' + href`` and are
        de-duplicated within each table.

    Anchors with no (or an empty) ``href`` are skipped; previously a
    missing ``href`` yielded ``False`` and the string concatenation raised
    TypeError.
    """
    soup = self.index_to_soup(url)
    # find <table width=436 border=0 cellspacing=0 align=center cellpadding=0> tag
    tables = soup.findAll(name={'table'}, attrs={'width': ['436']})
    current_articles_all = []
    for table in tables:
        included_urls = set()  # de-duplication scope is one table, as before
        for anchor in table.findAll(name={'a'}):
            title = self.tag_to_string(anchor)
            href = anchor.get('href', False)
            if not href:
                continue
            urlstr = url + '/../' + href
            if urlstr in included_urls:
                continue
            included_urls.add(urlstr)
            current_articles_all.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
    return current_articles_all
|
||||
|
||||
def parse_section_withouttext(self, url, baseurl):
    """Collect article links from every anchor of a section index page.

    :param url: address of the section index page.
    :param baseurl: marker string; an href that contains neither *baseurl*
        nor ``mailto:`` is rewritten as ``url + '/../' + href``, any other
        href is used unchanged.
    :return: list of article dicts (``title``, ``url``, ``description``,
        ``date``) without duplicate URLs.

    Anchors of class ``secondhead`` / ``second02``, anchors whose text is
    blank, and anchors with no ``href`` are ignored. The last case used to
    raise AttributeError, because ``get`` returned ``False``.
    """
    soup = self.index_to_soup(url)
    # find all a tag
    links = soup.findAll(name={'a'})
    for excluded_class in ('secondhead', 'second02'):
        for elink in soup.findAll(name={'a'}, attrs={'class': excluded_class}):
            links.remove(elink)
    current_articles_all = []
    included_urls = set()
    for link in links:
        title = self.tag_to_string(link)
        if not title.strip():
            continue
        urlstr = link.get('href', False)
        if not urlstr:
            continue
        if urlstr.rfind(baseurl) == -1 and urlstr.rfind('mailto:') == -1:
            urlstr = url + '/../' + urlstr
        if urlstr not in included_urls:
            included_urls.add(urlstr)
            current_articles_all.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
    return current_articles_all
|
||||
|
||||
def parse_multiitem_section_m(self, url, baseurl):
    """Collect article links from a section title page of the mobile site.

    :param url: address of the section title page.
    :param baseurl: prefix put in front of every href.
    :return: list of article dicts (``title``, ``url``, ``description``,
        ``date``) without duplicate URLs; empty when the page contains no
        ``<span class="urlurl">`` element.

    The page-wide anchor list is now collected once rather than once per
    span; the result is the same because duplicates were already dropped.
    Anchors with no ``href`` are skipped instead of raising TypeError.
    """
    soup = self.index_to_soup(url)
    markers = soup.findAll(name={'span'}, attrs={'class':'urlurl'})
    current_articles_all = []
    if not markers:
        return current_articles_all
    # NOTE(review): the original searched the whole soup (not each span) for
    # anchors; that behaviour is kept. Restricting the search to the span
    # may have been the intent -- confirm against the live page first.
    included_urls = set()
    for link in soup.findAll(name={'a'}):
        title = self.tag_to_string(link)
        if not title.strip():
            continue
        href = link.get('href', False)
        if not href:
            continue
        urlstr = baseurl + href
        if urlstr not in included_urls:
            included_urls.add(urlstr)
            current_articles_all.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
    return current_articles_all
|
||||
|
||||
# Post-process one downloaded article: fill in its title from the page,
# optionally register a TOC thumbnail, and set its summary text.
# The module-level flags __Source__, __IncludeThumbnails__ and
# __IncludeSummary__ select which of these steps run and which page markup
# is searched.
def populate_article_metadata(self, article, soup, first):
|
||||
if __Source__ == 'normal':
|
||||
# get title if not fetched in parse_section() function
|
||||
if article.title == '' or len(article.title.strip()) == 0:
|
||||
articletitle = soup.findAll('td',attrs={'class':'bodyhead'})
|
||||
if articletitle:
|
||||
articletitlemod = articletitle[0].find('font')
|
||||
if articletitlemod:
|
||||
article.title = articletitlemod.string.strip()
|
||||
else:
|
||||
article.title = articletitle[0].string.strip()
|
||||
else:
|
||||
# use the title in the text in any case
|
||||
articletitle = soup.findAll('td', attrs={'class':'stmobheadline'})
|
||||
if articletitle:
|
||||
# NOTE(review): assumes the headline cell contains a <br>; presumably .br
# is None otherwise and this raises before the try block below -- confirm.
articletitle[0].br.extract()
|
||||
article.title = articletitle[0].contents[0]
|
||||
# get thumbnail image
|
||||
if __IncludeThumbnails__ and first and hasattr(self, 'add_toc_thumbnail'):
|
||||
img = soup.find('img')
|
||||
if img is not None:
|
||||
self.add_toc_thumbnail(article, img['src'])
|
||||
|
||||
# Summary generation is best-effort: any exception raised below is logged
# and swallowed by the except clause at the end.
try:
|
||||
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
|
||||
# look for content
|
||||
if __Source__ == 'normal':
|
||||
articlebodies = soup.findAll('font',attrs={'class':'bodytext'})
|
||||
else:
|
||||
articlebodies = soup.findAll('div', attrs={'class':'hkadj'})
|
||||
if articlebodies:
|
||||
for articlebody in articlebodies:
|
||||
if articlebody:
|
||||
# the text may or may not be enclosed in <p></p> tag
|
||||
paras = articlebody.findAll('p')
|
||||
if not paras:
|
||||
# no <p> found: loop over the body element itself instead
paras = articlebody
|
||||
textFound = False
|
||||
for p in paras:
|
||||
if not textFound:
|
||||
summary_candidate = self.tag_to_string(p).strip()
|
||||
if len(summary_candidate) > 0:
|
||||
# drop the first occurrence of the parenthesised marker text, if present
summary_candidate = summary_candidate.replace(u'(\u661f\u5cf6\u65e5\u5831\u5831\u9053)', '', 1)
|
||||
article.summary = article.text_summary = summary_candidate
|
||||
textFound = True
|
||||
# NOTE(review): indentation is lost in this view; this else appears to pair
# with the __IncludeSummary__ test above -- confirm.
else:
|
||||
# display a simple text
|
||||
#article.summary = article.text_summary = u'\u66f4\u591a......'
|
||||
# display word counts
|
||||
counts = 0
|
||||
if __Source__ == 'normal':
|
||||
articlebodies = soup.findAll('font',attrs={'class':'bodytext'})
|
||||
else:
|
||||
articlebodies = soup.findAll('div', attrs={'class':'hkadj'})
|
||||
if articlebodies:
|
||||
for articlebody in articlebodies:
|
||||
# the text may or may not be enclosed in <p></p> tag
|
||||
paras = articlebody.findAll('p')
|
||||
if not paras:
|
||||
paras = articlebody
|
||||
for p in paras:
|
||||
summary_candidate = self.tag_to_string(p).strip()
|
||||
# counts is the total length of the stripped text pieces
counts += len(summary_candidate)
|
||||
article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
|
||||
# NOTE(review): a bare except also catches KeyboardInterrupt/SystemExit;
# `except Exception:` is probably what is wanted here.
except:
|
||||
self.log("Error creating article descriptions")
|
||||
return
|
||||
|
||||
# override from the one in version 0.8.31
|
||||
def create_opf(self, feeds, dir=None):
    """Write ``index.opf`` and ``index.ncx`` for the downloaded feeds.

    Differences from the stock 0.8.31 implementation this overrides:

    1. the ``__MakePeriodical__`` flag decides whether the issue date is
       appended to the title, and that date is the adjusted one;
    2. the newspaper publisher is used where ``__appname__`` was;
    3. ``__MakePeriodical__`` decides whether the publication type is
       prefixed with ``periodical:``;
    4. the publication date is the adjusted date at 12:30;
    5. article titles are not listed in the book comments.

    :param feeds: sequence of feed objects; each is iterated for its
        articles and must expose ``title``.
    :param dir: output directory; defaults to ``self.output_dir``.
    :raises Exception: when *feeds* is empty.
    """
    if dir is None:
        dir = self.output_dir
    title = self.short_title()
    # change 1: allow our own flag to tell if a periodical is to be generated
    # also use customed date instead of current time
    # (the flag is tested by truthiness; it is presumably a plain bool)
    if not __MakePeriodical__ or self.output_profile.periodical_date_in_title:
        title = title + ' ' + self.get_fetchformatteddate()
    # end of change 1
    # change 2: __appname__ replaced by newspaper publisher
    __appname__ = self.publisher
    mi = MetaInformation(title, [__appname__])
    mi.publisher = __appname__
    mi.author_sort = __appname__
    # change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
    if __MakePeriodical__:
        mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
    else:
        mi.publication_type = self.publication_type+':'+self.short_title()
    # change 4: in the following, all the nowf() are changed to adjusted time
    # This one doesn't matter
    mi.timestamp = nowf()
    # change 5: skip listing the articles (mi.comments is left untouched)

    language = canonicalize_lang(self.language)
    if language is not None:
        mi.language = language
    # This one affects the pub date shown in kindle title
    # now appears to need the time field to be > 12.00noon as well
    # Read the clock once so year/month/day cannot straddle a date change.
    issue_day = self.get_dtlocal()
    mi.pubdate = datetime.datetime(issue_day.year, issue_day.month, issue_day.day, 12, 30, 0)
    opf_path = os.path.join(dir, 'index.opf')
    ncx_path = os.path.join(dir, 'index.ncx')

    opf = OPFCreator(dir, mi)
    # Add mastheadImage entry to <guide> section
    mp = getattr(self, 'masthead_path', None)
    if mp is not None and os.access(mp, os.R_OK):
        from calibre.ebooks.metadata.opf2 import Guide
        ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
        ref.type = 'masthead'
        ref.title = 'Masthead Image'
        opf.guide.append(ref)

    manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
    manifest.append(os.path.join(dir, 'index.html'))
    manifest.append(os.path.join(dir, 'index.ncx'))

    # Get cover
    cpath = getattr(self, 'cover_path', None)
    if cpath is None:
        # The handle used to be left open; close it once the default cover
        # has been written so the data is on disk before it is referenced.
        with open(os.path.join(dir, 'cover.jpg'), 'wb') as pf:
            if self.default_cover(pf):
                cpath = pf.name
    if cpath is not None and os.access(cpath, os.R_OK):
        opf.cover = cpath
        manifest.append(cpath)

    # Get masthead
    mpath = getattr(self, 'masthead_path', None)
    if mpath is not None and os.access(mpath, os.R_OK):
        manifest.append(mpath)

    opf.create_manifest_from_files_in(manifest)
    for mani in opf.manifest:
        if mani.path.endswith('.ncx'):
            mani.id = 'ncx'
        if mani.path.endswith('mastheadImage.jpg'):
            mani.id = 'masthead-image'

    entries = ['index.html']
    toc = TOC(base_path=dir)
    self.play_order_counter = 0
    self.play_order_map = {}

    def next_play_order(entry):
        # Reuse a recorded play order for this entry, else hand out the next one.
        po = self.play_order_map.get(entry, None)
        if po is None:
            self.play_order_counter += 1
            po = self.play_order_counter
        return po

    def feed_index(num, parent):
        # Add every downloaded article of feed *num* to the spine and TOC,
        # and append a navigation bar to the last page of each article.
        f = feeds[num]
        for j, a in enumerate(f):
            if not getattr(a, 'downloaded', False):
                continue
            adir = 'feed_%d/article_%d/'%(num, j)
            auth = a.author
            if not auth:
                auth = None
            desc = a.text_summary
            if not desc:
                desc = None
            else:
                desc = self.description_limiter(desc)
            tt = a.toc_thumbnail if a.toc_thumbnail else None
            entries.append('%sindex.html'%adir)
            po = next_play_order(entries[-1])
            parent.add_item('%sindex.html'%adir, None,
                    a.title if a.title else ('Untitled Article'),
                    play_order=po, author=auth,
                    description=desc, toc_thumbnail=tt)
            last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
            for sp in a.sub_pages:
                prefix = os.path.commonprefix([opf_path, sp])
                relp = sp[len(prefix):]
                entries.append(relp.replace(os.sep, '/'))
                last = sp

            if os.path.exists(last):
                with open(last, 'rb') as fi:
                    src = fi.read().decode('utf-8')
                soup = BeautifulSoup(src)
                body = soup.find('body')
                if body is not None:
                    prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
                    templ = self.navbar.generate(True, num, j, len(f),
                                    not self.has_single_feed,
                                    a.orig_url, __appname__, prefix=prefix,
                                    center=self.center_navbar)
                    elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                    body.insert(len(body.contents), elem)
                    with open(last, 'wb') as fi:
                        fi.write(unicode(soup).encode('utf-8'))

    if len(feeds) == 0:
        raise Exception('All feeds are empty, aborting.')

    if len(feeds) > 1:
        for i, f in enumerate(feeds):
            entries.append('feed_%d/index.html'%i)
            po = next_play_order(entries[-1])
            auth = getattr(f, 'author', None)
            if not auth:
                auth = None
            desc = getattr(f, 'description', None)
            if not desc:
                desc = None
            feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                f.title, play_order=po, description=desc, author=auth))
    else:
        entries.append('feed_%d/index.html'%0)
        feed_index(0, toc)

    for i, p in enumerate(entries):
        entries[i] = os.path.join(dir, p.replace('/', os.sep))
    opf.create_spine(entries)
    opf.set_toc(toc)

    with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
        opf.render(opf_file, ncx_file)
|
||||
|
||||
|
||||
|
12
recipes/sivil_dusunce.recipe
Normal file
12
recipes/sivil_dusunce.recipe
Normal file
@ -0,0 +1,12 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class BasicUserRecipe1324913680(BasicNewsRecipe):
    # Identification
    __author__ = 'asalet_r'
    title = u'Sivil Dusunce'
    language = 'tr'

    # Single source feed
    feeds = [(u'Sivil Dusunce', u'http://www.sivildusunce.com/feed/')]

    # Fetch limits and clean-up
    max_articles_per_feed = 20
    oldest_article = 7
    auto_cleanup = True
|
@ -14,6 +14,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
|
||||
language = 'de'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
publication_type = 'newspaper'
|
||||
|
||||
extra_css = '''
|
||||
.hcf-overline{color:#990000; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;display:block}
|
||||
@ -33,17 +34,15 @@ class TagesspiegelRSS(BasicNewsRecipe):
|
||||
no_javascript = True
|
||||
remove_empty_feeds = True
|
||||
encoding = 'utf-8'
|
||||
remove_tags = [{'class':'hcf-header'}, {'class':'hcf-atlas'}, {'class':'hcf-date hcf-separate'}]
|
||||
|
||||
keep_only_tags = dict(name='div', attrs={'class':["hcf-article"]})
|
||||
remove_tags = [
|
||||
dict(name='link'), dict(name='iframe'),dict(name='style'),dict(name='meta'),dict(name='button'),
|
||||
dict(name='div', attrs={'class':["hcf-jump-to-comments","hcf-clear","hcf-magnify hcf-media-control",
|
||||
"hcf-socials-widgets hcf-socials-top","hcf-socials-widgets hcf-socials-bottom"] }),
|
||||
dict(name='span', attrs={'class':["hcf-mainsearch",] }),
|
||||
dict(name='ul', attrs={'class':["hcf-tools"]}),
|
||||
dict(name='ul', attrs={'class': re.compile('hcf-services')})
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
url = url.split('/')
|
||||
url[-1] = 'v_print,%s?p='%url[-1]
|
||||
return '/'.join(url)
|
||||
|
||||
def get_masthead_url(self):
|
||||
return 'http://www.tagesspiegel.de/images/tsp_logo/3114/6.png'
|
||||
|
||||
def parse_index(self):
|
||||
soup = self.index_to_soup('http://www.tagesspiegel.de/zeitung/')
|
||||
@ -56,7 +55,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
|
||||
ans = []
|
||||
maincol = soup.find('div', attrs={'class':re.compile('hcf-main-col')})
|
||||
|
||||
for div in maincol.findAll(True, attrs={'class':['hcf-teaser', 'hcf-header', 'story headline']}):
|
||||
for div in maincol.findAll(True, attrs={'class':['hcf-teaser', 'hcf-header', 'story headline', 'hcf-teaser hcf-last']}):
|
||||
|
||||
if div['class'] == 'hcf-header':
|
||||
try:
|
||||
@ -66,7 +65,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
|
||||
except:
|
||||
continue
|
||||
|
||||
elif div['class'] == 'hcf-teaser' and getattr(div.contents[0],'name','') == 'h2':
|
||||
elif div['class'] in ['hcf-teaser', 'hcf-teaser hcf-last'] and getattr(div.contents[0],'name','') == 'h2':
|
||||
a = div.find('a', href=True)
|
||||
if not a:
|
||||
continue
|
||||
|
12
recipes/tasfiye_dergisi.recipe
Normal file
12
recipes/tasfiye_dergisi.recipe
Normal file
@ -0,0 +1,12 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class BasicUserRecipe1324739957(BasicNewsRecipe):
    # Identification
    __author__ = 'asalet_r'
    title = u'Tasfiye Dergisi'
    language = 'tr'

    # Single source feed
    feeds = [(u'Tasfiye Dergisi', u'http://www.tasfiyedergisi.com/direnen-edebiyat/?feed=rss2')]

    # Fetch limits and clean-up
    max_articles_per_feed = 20
    oldest_article = 7
    auto_cleanup = True
|
25
recipes/tillsonburg.recipe
Normal file
25
recipes/tillsonburg.recipe
Normal file
@ -0,0 +1,25 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
'''
|
||||
Tillsonburg/Norfolk County newspapers Calibre Recipe
|
||||
'''
|
||||
class TillsonburgNorfolkCounty(BasicNewsRecipe):
    # Combines the RSS feeds of three newspapers into one recipe.
    title = u'Tillsonburg/Norfolk County'
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True
    __author__ = u'Eric Coolman'
    publisher = u'canoe.ca'
    description = u'Norfolk County and Tillsonburg, Ontario Canada Newspapers'
    category = u'News, Ontario, Canada'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'en_CA'
    encoding = 'utf-8'

    feeds = [
        (u'Simcoe Reformer', u'http://www.simcoereformer.ca/rss/'),
        (u'Delhi News-Record', u'http://www.delhinewsrecord.com/rss/'),
        # Section title was misspelled 'Tilsonburg'; it now matches the
        # recipe title and the feed's own host name.
        (u'Tillsonburg News', u'http://www.tillsonburgnews.com/rss/'),
    ]
|
@ -1,4 +1,4 @@
|
||||
import re
|
||||
import re, urllib
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class TimesOfIndia(BasicNewsRecipe):
|
||||
@ -17,7 +17,9 @@ class TimesOfIndia(BasicNewsRecipe):
|
||||
]
|
||||
remove_tags = [
|
||||
{'class':re.compile('tabsintbgshow|prvnxtbg')},
|
||||
{'id':['fbrecommend', 'relmaindiv']}
|
||||
{'id':['fbrecommend', 'relmaindiv', 'shretxt', 'fbrecos', 'twtdiv',
|
||||
'gpls', 'auim']},
|
||||
{'class':['twitter-share-button', 'cmtmn']},
|
||||
]
|
||||
|
||||
feeds = [
|
||||
@ -46,25 +48,27 @@ class TimesOfIndia(BasicNewsRecipe):
|
||||
]
|
||||
|
||||
def get_article_url(self, article):
|
||||
# Times of India sometimes serves an ad page instead of the article,
|
||||
# this code, detects and circumvents that
|
||||
url = BasicNewsRecipe.get_article_url(self, article)
|
||||
if '/0Ltimesofindia' in url:
|
||||
url = url.partition('/0L')[-1]
|
||||
url = url.replace('0B', '.').replace('0N', '.com').replace('0C',
|
||||
'/').replace('0E', '-')
|
||||
url = 'http://' + url.rpartition('/')[0]
|
||||
match = re.search(r'/([0-9a-zA-Z]+?)\.cms', url)
|
||||
if match is not None:
|
||||
num = match.group(1)
|
||||
num = re.sub(r'[^0-9]', '', num)
|
||||
return ('http://timesofindia.indiatimes.com/articleshow/%s.cms?prtpage=1' %
|
||||
num)
|
||||
else:
|
||||
cms = re.search(r'/(\d+)\.cms', url)
|
||||
if cms is not None:
|
||||
return ('http://timesofindia.indiatimes.com/articleshow/%s.cms?prtpage=1' %
|
||||
cms.group(1))
|
||||
try:
|
||||
s = article.summary
|
||||
return urllib.unquote(
|
||||
re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
|
||||
except:
|
||||
pass
|
||||
link = article.get('link', None)
|
||||
if link and link.split('/')[-1]=="story01.htm":
|
||||
link=link.split('/')[-2]
|
||||
encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
|
||||
'0D': '?', '0E': '-', '0N': '.com', '0L': 'http://'}
|
||||
for k, v in encoding.iteritems():
|
||||
link = link.replace(k, v)
|
||||
return link
|
||||
|
||||
return url
|
||||
def print_version(self, url):
|
||||
return url + '?prtpage=1'
|
||||
|
||||
def preprocess_html(self, soup, *args):
|
||||
byl = soup.find(attrs={'class':'byline'})
|
||||
if byl is not None:
|
||||
for l in byl.findAll('label'):
|
||||
l.extract()
|
||||
return soup
|
||||
|
66
recipes/tweakers_net.recipe
Normal file
66
recipes/tweakers_net.recipe
Normal file
@ -0,0 +1,66 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import with_statement
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Tweakers(BasicNewsRecipe):
    # Tweakers.net news articles together with their reader reactions.
    title = u'Tweakers.net - with Reactions'
    __author__ = 'Roedi06'
    language = 'nl'
    oldest_article = 7
    max_articles_per_feed = 100
    cover_url = 'http://img51.imageshack.us/img51/7470/tweakersnetebook.gif'

    keep_only_tags = [dict(name='div', attrs={'class':'columnwrapper news'}),
                      {'id':'reacties'},
                      ]

    remove_tags = [dict(name='div', attrs={'id' : ['utracker']}),
                   {'id' : ['channelNav']},
                   {'id' : ['contentArea']},
                   {'class' : ['breadCrumb']},
                   {'class' : ['nextPrevious ellipsis']},
                   {'class' : ['advertorial']},
                   {'class' : ['sidebar']},
                   {'class' : ['filterBox']},
                   {'id' : ['toggleButtonTxt']},
                   {'id' : ['socialButtons']},
                   {'class' : ['button']},
                   {'class' : ['textadTop']},
                   {'class' : ['commentLink']},
                   {'title' : ['Reageer op deze reactie']},
                   {'class' : ['pageIndex']},
                   {'class' : ['reactieHeader collapsed']},
                   ]
    no_stylesheets=True

    # Applied in order to the raw HTML.
    preprocess_regexps = [
        # Was r'<hr*?>' ("h" followed by repeated "r"), which misses
        # '<hr />' and '<hr class=...>'. Match any hr tag instead.
        (re.compile(r'<hr\b[^>]*>', re.IGNORECASE | re.DOTALL), lambda match : ''),
        (re.compile(r'<p>', re.IGNORECASE | re.DOTALL), lambda match : ''),
        (re.compile(r'</p>', re.IGNORECASE | re.DOTALL), lambda match : ''),
        # Was r'<a.*?>', which also rewrote tags that merely start with
        # "a" (e.g. <abbr>) and left their closing tags unbalanced.
        (re.compile(r'<a\b[^>]*>'), lambda h1: '<b><u>'),
        (re.compile(r'</a>'), lambda h2: '</u></b>'),
        (re.compile(r'<span class="new">', re.IGNORECASE | re.DOTALL), lambda match : ''),
        (re.compile(r'</span>', re.IGNORECASE | re.DOTALL), lambda match : ''),
        # Put the moderation score in as text in front of the score image;
        # the moderation <div> itself is stripped by the last rule.
        (re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_0'), lambda match : ' - moderated 0<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_0'),
        (re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_1'), lambda match : ' - moderated +1<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_1'),
        (re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_2'), lambda match : ' - moderated +2<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_2'),
        (re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_3'), lambda match : ' - moderated +3<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_3'),
        (re.compile(r'<div class="moderation">.*?</div>'), lambda h1: ''),
        ]

    extra_css = '.reactieHeader { color: #333333; font-size: 6px; border-bottom:solid 2px #333333; border-top:solid 1px #333333; } \
.reactieContent { font-family:"Times New Roman",Georgia,Serif; color: #000000; font-size: 8px; } \
.quote { font-family:"Times New Roman",Georgia,Serif; padding-left:2px; border-left:solid 3px #666666; color: #666666; }'


    feeds = [(u'Tweakers.net', u'http://feeds.feedburner.com/tweakers/nieuws')]

    def print_version(self, url):
        """Return *url* with ``?max=200`` appended."""
        return url + '?max=200'
|
||||
|
@ -11,8 +11,9 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class USAToday(BasicNewsRecipe):
|
||||
|
||||
title = 'USA Today'
|
||||
__author__ = 'calibre'
|
||||
__author__ = 'Kovid Goyal'
|
||||
description = 'newspaper'
|
||||
cover_url = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg12/lg/USAT.jpg'
|
||||
encoding = 'utf-8'
|
||||
publisher = 'usatoday.com'
|
||||
category = 'news, usa'
|
||||
@ -47,32 +48,7 @@ class USAToday(BasicNewsRecipe):
|
||||
('Offbeat News', 'http://rssfeeds.usatoday.com/UsatodaycomOffbeat-TopStories')
|
||||
]
|
||||
|
||||
keep_only_tags = [dict(attrs={'class':'story'})]
|
||||
|
||||
remove_tags = [
|
||||
dict(attrs={'class':[
|
||||
'share',
|
||||
'reprints',
|
||||
'inline-h3',
|
||||
'info-extras rounded',
|
||||
'inset',
|
||||
'ppy-outer',
|
||||
'ppy-caption',
|
||||
'comments',
|
||||
'jump',
|
||||
'pagetools',
|
||||
'post-attributes',
|
||||
'tags',
|
||||
'bottom-tools',
|
||||
'sponsoredlinks',
|
||||
'corrections'
|
||||
]}),
|
||||
dict(name='ul', attrs={'class':'inside-copy'}),
|
||||
dict(id=['pluck']),
|
||||
dict(id=['updated']),
|
||||
dict(id=['post-date-updated'])
|
||||
]
|
||||
|
||||
auto_cleanup = True
|
||||
|
||||
def get_masthead_url(self):
|
||||
masthead = 'http://i.usatoday.net/mobile/_common/_images/565x73_usat_mobile.gif'
|
||||
|
@ -1,5 +1,5 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '2009-2012, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
www.variety.com
|
||||
'''
|
||||
@ -14,11 +14,11 @@ class Variety(BasicNewsRecipe):
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
encoding = 'cp1252'
|
||||
encoding = 'utf8'
|
||||
publisher = 'Red Business Information'
|
||||
category = 'Entertainment Industry News, Daily Variety, Movie Reviews, TV, Awards, Oscars, Cannes, Box Office, Hollywood'
|
||||
language = 'en'
|
||||
masthead_url = 'http://a330.g.akamai.net/7/330/23382/20090528190853/www.variety.com/graphics/variety/Variety_logo_green_tm.gif'
|
||||
masthead_url = 'http://images1.variety.com/graphics/variety/Variety_logo_green_tm.gif'
|
||||
extra_css = ' body{font-family: Georgia,"Times New Roman",Times,Courier,serif } img{margin-bottom: 1em} '
|
||||
|
||||
conversion_options = {
|
||||
@ -30,17 +30,10 @@ class Variety(BasicNewsRecipe):
|
||||
|
||||
remove_tags = [dict(name=['object','link','map'])]
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'article'})]
|
||||
keep_only_tags = [dict(name='div', attrs={'class':'art control'})]
|
||||
|
||||
feeds = [(u'News & Articles', u'http://feeds.feedburner.com/variety/headlines' )]
|
||||
|
||||
def print_version(self, url):
|
||||
rpt = url.rpartition('?')[0]
|
||||
artid = rpt.rpartition('/')[2]
|
||||
catidr = url.rpartition('categoryid=')[2]
|
||||
catid = catidr.partition('&')[0]
|
||||
return 'http://www.variety.com/index.asp?layout=print_story&articleid=' + artid + '&categoryid=' + catid
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
return self.adeify_images(soup)
|
||||
rpt = url.rpartition('.html')[0]
|
||||
return rpt + '?printerfriendly=true'
|
||||
|
46
recipes/villagevoice.recipe
Normal file
46
recipes/villagevoice.recipe
Normal file
@ -0,0 +1,46 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class VillageVoice(BasicNewsRecipe):

    # Identification
    title = 'Village Voice'
    __author__ = 'Barty'
    language = 'en'
    masthead_url = "http://assets.villagevoice.com/img/citylogo.png"

    # Fetch limits and clean-up
    max_articles_per_feed = 50
    auto_cleanup = True

    feeds = [
        ("Complete Issue", "http://villagevoice.com/syndication/issue"),
        ("News", "http://villagevoice.com/syndication/section/news"),
        ("Music", "http://villagevoice.com/syndication/section/music"),
        ("Movies", "http://villagevoice.com/syndication/section/film"),
        #("Restaurants", "http://villagevoice.com/syndication/section/dining"),
        #("Music Events", "http://villagevoice.com/syndication/events?type=music"),
        #("Calendar Events", "http://villagevoice.com/syndication/events"),
        #("Promotional Events", "http://villagevoice.com/syndication/promoEvents"),
        #("Restaurant Guide", "http://villagevoice.com/syndication/restaurants/search")
    ]

    # URLs already handed to print_version; repeats are answered with None.
    seen_urls = []

    # village voice breaks the article up into multiple pages, so
    # parse page and grab the print url
    url_regex = re.compile(r'\/content\/printVersion\/\d+',re.I)

    def print_version(self, url):
        """Return the print-version URL for *url*.

        Returns None when *url* was seen before, and *url* itself (after
        logging a warning) when the page has no print-version link.
        """
        if url in self.seen_urls:
            return None
        self.seen_urls.append( url)
        anchor = self.index_to_soup(url).find('a',attrs={'href':self.url_regex})
        if anchor is None:
            self.log('Warning: no print url found for '+url)
            return url
        found = self.url_regex.search(anchor['href'])
        if found:
            return 'http://www.villagevoice.com'+found.group(0)
        return url
|
12
recipes/wired_it.recipe
Normal file
12
recipes/wired_it.recipe
Normal file
@ -0,0 +1,12 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1325758162(BasicNewsRecipe):
    """Recipe for the Italian edition of Wired, fetched from its RSS feed."""

    title = u'Wired'
    language = 'it'
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True
    # Discard everything that follows the article body container.
    remove_tags_after = [dict(name='div', attrs={'class':'article_content'})]
    feeds = [(u'Wired', u'http://www.wired.it/rss.xml')]
    __author__ = 'faber1971'
    description = 'An American magazine that reports on how new technology affects culture, the economy, and politics'
|
@ -14,7 +14,7 @@
|
||||
|
||||
.button a, .button:visited a {
|
||||
padding: 0.5em;
|
||||
font-size: 1.25em;
|
||||
font-size: larger;
|
||||
border: 1px solid black;
|
||||
text-color: black;
|
||||
text-decoration: none;
|
||||
|
@ -196,7 +196,9 @@ title_series_sorting = 'library_order'
|
||||
# set to 'strictly_alphabetic', the series will be sent without change.
|
||||
# For example, if the tweak is set to library_order, "The Lord of the Rings"
|
||||
# will become "Lord of the Rings, The". If the tweak is set to
|
||||
# strictly_alphabetic, it would remain "The Lord of the Rings".
|
||||
# strictly_alphabetic, it would remain "The Lord of the Rings". Note that the
|
||||
# formatter function raw_field will return the base value for title and
|
||||
# series regardless of the setting of this tweak.
|
||||
save_template_title_series_sorting = 'library_order'
|
||||
|
||||
#: Set the list of words considered to be "articles" for sort strings
|
||||
@ -291,7 +293,7 @@ auto_connect_to_folder = ''
|
||||
# how the value and category are combined together to make the collection name.
|
||||
# The only two fields available are {category} and {value}. The {value} field is
|
||||
# never empty. The {category} field can be empty. The default is to put the
|
||||
# value first, then the category enclosed in parentheses, it is isn't empty:
|
||||
# value first, then the category enclosed in parentheses, if it isn't empty:
|
||||
# '{value} {category:|(|)}'
|
||||
# Examples: The first three examples assume that the second tweak
|
||||
# has not been changed.
|
||||
@ -471,3 +473,14 @@ unified_title_toolbar_on_osx = False
|
||||
# this to False you can prevent calibre from saving the original file.
|
||||
save_original_format = True
|
||||
|
||||
#: Number of recently viewed books to show
|
||||
# Right-clicking the View button shows a list of recently viewed books. Control
|
||||
# how many should be shown, here.
|
||||
gui_view_history_size = 15
|
||||
|
||||
#: When using the 'Tweak Book' action, which format to prefer
|
||||
# When tweaking a book that has multiple formats, calibre picks one
|
||||
# automatically. By default EPUB is preferred to HTMLZ. If you would like to
|
||||
# prefer HTMLZ to EPUB for tweaking, change this to 'htmlz'
|
||||
tweak_book_prefer = 'epub'
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/** @license Hyphenator X.Y.Z - client side hyphenation for webbrowsers
|
||||
* Copyright (C) 2010 Mathias Nater, Zürich (mathias at mnn dot ch)
|
||||
/** @license Hyphenator 4.0.0 - client side hyphenation for webbrowsers
|
||||
* Copyright (C) 2011 Mathias Nater, Zürich (mathias at mnn dot ch)
|
||||
* Project and Source hosted on http://code.google.com/p/hyphenator/
|
||||
*
|
||||
* This JavaScript code is free software: you can redistribute
|
||||
@ -15,6 +15,40 @@
|
||||
* that code without the copy of the GNU GPL normally required by
|
||||
* section 4, provided you include this license notice and a URL
|
||||
* through which recipients can access the Corresponding Source.
|
||||
*
|
||||
*
|
||||
* Hyphenator.js contains code from Bram Steins hypher.js-Project:
|
||||
* https://github.com/bramstein/Hypher
|
||||
*
|
||||
* Code from this project is marked in the source and belongs
|
||||
* to the following license:
|
||||
*
|
||||
* Copyright (c) 2011, Bram Stein
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. The name of the author may not be used to endorse or promote products
|
||||
* derived from this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
* EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
@ -56,6 +90,7 @@ var Hyphenator = (function (window) {
|
||||
*/
|
||||
supportedLang = {
|
||||
'be': 'be.js',
|
||||
'ca': 'ca.js',
|
||||
'cs': 'cs.js',
|
||||
'da': 'da.js',
|
||||
'bn': 'bn.js',
|
||||
@ -80,14 +115,16 @@ var Hyphenator = (function (window) {
|
||||
'lt': 'lt.js',
|
||||
'lv': 'lv.js',
|
||||
'ml': 'ml.js',
|
||||
'no': 'no-nb.js',
|
||||
'no-nb': 'no-nb.js',
|
||||
'nb': 'nb-no.js',
|
||||
'no': 'nb-no.js',
|
||||
'nb-no': 'nb-no.js',
|
||||
'nl': 'nl.js',
|
||||
'or': 'or.js',
|
||||
'pa': 'pa.js',
|
||||
'pl': 'pl.js',
|
||||
'pt': 'pt.js',
|
||||
'ru': 'ru.js',
|
||||
'sk': 'sk.js',
|
||||
'sl': 'sl.js',
|
||||
'sv': 'sv.js',
|
||||
'ta': 'ta.js',
|
||||
@ -235,7 +272,7 @@ var Hyphenator = (function (window) {
|
||||
* @private
|
||||
* @see Hyphenator-hyphenateElement
|
||||
*/
|
||||
dontHyphenate = {'script': true, 'code': true, 'pre': true, 'img': true, 'br': true, 'samp': true, 'kbd': true, 'var': true, 'abbr': true, 'acronym': true, 'sub': true, 'sup': true, 'button': true, 'option': true, 'label': true, 'textarea': true, 'input': true},
|
||||
dontHyphenate = {'script': true, 'code': true, 'pre': true, 'img': true, 'br': true, 'samp': true, 'kbd': true, 'var': true, 'abbr': true, 'acronym': true, 'sub': true, 'sup': true, 'button': true, 'option': true, 'label': true, 'textarea': true, 'input': true, 'math': true, 'svg': true},
|
||||
|
||||
/**
|
||||
* @name Hyphenator-enableCache
|
||||
@ -308,6 +345,86 @@ var Hyphenator = (function (window) {
|
||||
* @see Hyphenator-toggleBox
|
||||
*/
|
||||
displayToggleBox = false,
|
||||
|
||||
/**
|
||||
* @name Hyphenator-css3
|
||||
* @description
|
||||
* A variable to set if css3 hyphenation should be used
|
||||
* @type boolean
|
||||
* @default false
|
||||
* @private
|
||||
* @see Hyphenator.config
|
||||
*/
|
||||
css3 = false,
|
||||
/**
|
||||
* @name Hyphenator-css3_h9n
|
||||
* @description
|
||||
* A generated object containing information for CSS3-hyphenation support
|
||||
* {
|
||||
* support: boolean,
|
||||
* property: <the property name to access hyphen-settings>,
|
||||
* languages: <an object containing supported languages>
|
||||
* }
|
||||
* @type object
|
||||
* @default undefined
|
||||
* @private
|
||||
* @see Hyphenator-css3_gethsupport
|
||||
*/
|
||||
css3_h9n,
|
||||
/**
|
||||
* @name Hyphenator-css3_gethsupport
|
||||
* @description
|
||||
* This function sets Hyphenator-css3_h9n for the current UA
|
||||
* @type function
|
||||
* @private
|
||||
* @see Hyphenator-css3_h9n
|
||||
*/
|
||||
css3_gethsupport = function () {
|
||||
var s,
|
||||
ua = navigator.userAgent,
|
||||
r = {
|
||||
support: false,
|
||||
property: '',
|
||||
languages: {}
|
||||
};
|
||||
if (window.getComputedStyle) {
|
||||
s = window.getComputedStyle(window.document.getElementsByTagName('body')[0]);
|
||||
} else {
|
||||
//ancient Browser don't support CSS3 anyway
|
||||
css3_h9n = r;
|
||||
return;
|
||||
}
|
||||
if (ua.indexOf('Chrome') !== -1) {
|
||||
//Chrome actually knows -webkit-hyphens but does no hyphenation
|
||||
r.support = false;
|
||||
} else if ((ua.indexOf('Safari') !== -1) && (s['-webkit-hyphens'] !== undefined)) {
|
||||
r.support = true;
|
||||
r.property = '-webkit-hyphens';
|
||||
if (ua.indexOf('Mobile') !== -1) {
|
||||
//iOS only hyphenates in systemlanguage
|
||||
r.languages[navigator.language.split('-')[0]] = true;
|
||||
} else {
|
||||
//Desktop Safari only hyphenates some languages:
|
||||
r.languages = {
|
||||
de: true,
|
||||
en: true,
|
||||
es: true,
|
||||
fr: true,
|
||||
it: true,
|
||||
nl: true,
|
||||
ru: true,
|
||||
zh: true
|
||||
};
|
||||
}
|
||||
} else if ((ua.indexOf('Firefox') !== -1) && (s['MozHyphens'] !== undefined)) {
|
||||
r.support = true;
|
||||
r.property = 'MozHyphens';
|
||||
r.languages = {
|
||||
en: true
|
||||
};
|
||||
}
|
||||
css3_h9n = r;
|
||||
},
|
||||
|
||||
/**
|
||||
* @name Hyphenator-hyphenateClass
|
||||
@ -404,6 +521,7 @@ var Hyphenator = (function (window) {
|
||||
* @see Hyphenator-autoSetMainLanguage
|
||||
*/
|
||||
defaultLanguage = '',
|
||||
|
||||
|
||||
/**
|
||||
* @name Hyphenator-elements
|
||||
@ -413,7 +531,38 @@ var Hyphenator = (function (window) {
|
||||
* @type {Array}
|
||||
* @private
|
||||
*/
|
||||
elements = [],
|
||||
elements = (function () {
|
||||
var Element = function (element, data) {
|
||||
this.element = element;
|
||||
this.hyphenated = false;
|
||||
this.treated = false; //collected but not hyphenated (dohyphenation is off)
|
||||
this.data = data;
|
||||
},
|
||||
ElementCollection = function () {
|
||||
this.count = 0;
|
||||
this.hyCount = 0;
|
||||
this.list = {};
|
||||
};
|
||||
ElementCollection.prototype = {
|
||||
add: function (el, lang, data) {
|
||||
if (!this.list.hasOwnProperty(lang)) {
|
||||
this.list[lang] = [];
|
||||
}
|
||||
this.list[lang].push(new Element(el, data));
|
||||
this.count += 1;
|
||||
},
|
||||
each: function (fn) {
|
||||
var k;
|
||||
for (k in this.list) {
|
||||
if (this.list.hasOwnProperty(k)) {
|
||||
fn(k, this.list[k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
return new ElementCollection();
|
||||
}()),
|
||||
|
||||
|
||||
/**
|
||||
* @name Hyphenator-exceptions
|
||||
@ -426,15 +575,6 @@ var Hyphenator = (function (window) {
|
||||
*/
|
||||
exceptions = {},
|
||||
|
||||
countObjProps = function (obj) {
|
||||
var k, l = 0;
|
||||
for (k in obj) {
|
||||
if (obj.hasOwnProperty(k)) {
|
||||
l++;
|
||||
}
|
||||
}
|
||||
return l;
|
||||
},
|
||||
/**
|
||||
* @name Hyphenator-docLanguages
|
||||
* @description
|
||||
@ -445,7 +585,6 @@ var Hyphenator = (function (window) {
|
||||
*/
|
||||
docLanguages = {},
|
||||
|
||||
|
||||
/**
|
||||
* @name Hyphenator-state
|
||||
* @description
|
||||
@ -583,6 +722,18 @@ var Hyphenator = (function (window) {
|
||||
*/
|
||||
intermediateState = 'hidden',
|
||||
|
||||
/**
|
||||
* @name Hyphenator-unhide
|
||||
* @description
|
||||
* How hidden elements unhide: either simultaneous (default: 'wait') or progressively.
|
||||
* 'wait' makes Hyphenator.js wait until all elements are hyphenated (one redraw)
|
||||
* With 'progressive' Hyphenator.js unhides elements as soon as they are hyphenated.
|
||||
* @see Hyphenator.config
|
||||
* @type {string}
|
||||
* @private
|
||||
*/
|
||||
unhide = 'wait',
|
||||
|
||||
/**
|
||||
* @name Hyphenator-hyphen
|
||||
* @description
|
||||
@ -619,50 +770,6 @@ var Hyphenator = (function (window) {
|
||||
*/
|
||||
safeCopy = true,
|
||||
|
||||
/**
|
||||
* @name Hyphenator-Expando
|
||||
* @description
|
||||
* This custom object stores data for elements: storing data directly in elements
|
||||
* (DomElement.customData = foobar;) isn't a good idea. It would lead to conflicts
|
||||
* in form elements, when the form has a child with name="foobar". Therefore, this
|
||||
* solution follows the approach of jQuery: the data is stored in an object and
|
||||
* referenced by a unique attribute of the element. The attribute has a name that
|
||||
* is built by the prefix "HyphenatorExpando_" and a random number, so if the very
|
||||
* very rare case occurs, that there's already an attribute with the same name, a
|
||||
* simple reload is enough to make it function.
|
||||
* @private
|
||||
*/
|
||||
Expando = (function () {
|
||||
var container = {},
|
||||
name = "HyphenatorExpando_" + Math.random(),
|
||||
uuid = 0;
|
||||
return {
|
||||
getDataForElem : function (elem) {
|
||||
return container[elem[name].id];
|
||||
},
|
||||
setDataForElem : function (elem, data) {
|
||||
var id;
|
||||
if (elem[name] && elem[name].id !== '') {
|
||||
id = elem[name].id;
|
||||
} else {
|
||||
id = uuid++;
|
||||
elem[name] = {'id': id}; //object needed, otherways it is reflected in HTML in IE
|
||||
}
|
||||
container[id] = data;
|
||||
},
|
||||
appendDataForElem : function (elem, data) {
|
||||
var k;
|
||||
for (k in data) {
|
||||
if (data.hasOwnProperty(k)) {
|
||||
container[elem[name].id][k] = data[k];
|
||||
}
|
||||
}
|
||||
},
|
||||
delDataOfElem : function (elem) {
|
||||
delete container[elem[name]];
|
||||
}
|
||||
};
|
||||
}()),
|
||||
|
||||
/*
|
||||
* runOnContentLoaded is based on jQuery.bindReady()
|
||||
@ -915,36 +1022,41 @@ var Hyphenator = (function (window) {
|
||||
var elToProcess, tmp, i = 0,
|
||||
process = function (el, hide, lang) {
|
||||
var n, i = 0, hyphenatorSettings = {};
|
||||
if (hide && intermediateState === 'hidden') {
|
||||
if (!!el.getAttribute('style')) {
|
||||
hyphenatorSettings.hasOwnStyle = true;
|
||||
} else {
|
||||
hyphenatorSettings.hasOwnStyle = false;
|
||||
}
|
||||
hyphenatorSettings.isHidden = true;
|
||||
el.style.visibility = 'hidden';
|
||||
}
|
||||
|
||||
if (el.lang && typeof(el.lang) === 'string') {
|
||||
hyphenatorSettings.language = el.lang.toLowerCase(); //copy attribute-lang to internal lang
|
||||
lang = el.lang.toLowerCase(); //copy attribute-lang to internal lang
|
||||
} else if (lang) {
|
||||
hyphenatorSettings.language = lang.toLowerCase();
|
||||
lang = lang.toLowerCase();
|
||||
} else {
|
||||
hyphenatorSettings.language = getLang(el, true);
|
||||
lang = getLang(el, true);
|
||||
}
|
||||
lang = hyphenatorSettings.language;
|
||||
if (supportedLang[lang]) {
|
||||
docLanguages[lang] = true;
|
||||
} else {
|
||||
if (supportedLang.hasOwnProperty(lang.split('-')[0])) { //try subtag
|
||||
lang = lang.split('-')[0];
|
||||
hyphenatorSettings.language = lang;
|
||||
} else if (!isBookmarklet) {
|
||||
onError(new Error('Language ' + lang + ' is not yet supported.'));
|
||||
}
|
||||
}
|
||||
Expando.setDataForElem(el, hyphenatorSettings);
|
||||
|
||||
elements.push(el);
|
||||
//if css3-hyphenation is supported: use it!
|
||||
if (css3 && css3_h9n.support && !!css3_h9n.languages[lang]) {
|
||||
el.style[css3_h9n.property] = "auto";
|
||||
el.style['-webkit-locale'] = "'" + lang + "'";
|
||||
} else {
|
||||
if (intermediateState === 'hidden') {
|
||||
if (!!el.getAttribute('style')) {
|
||||
hyphenatorSettings.hasOwnStyle = true;
|
||||
} else {
|
||||
hyphenatorSettings.hasOwnStyle = false;
|
||||
}
|
||||
hyphenatorSettings.isHidden = true;
|
||||
el.style.visibility = 'hidden';
|
||||
}
|
||||
if (supportedLang[lang]) {
|
||||
docLanguages[lang] = true;
|
||||
} else {
|
||||
if (supportedLang.hasOwnProperty(lang.split('-')[0])) { //try subtag
|
||||
lang = lang.split('-')[0];
|
||||
hyphenatorSettings.language = lang;
|
||||
} else if (!isBookmarklet) {
|
||||
onError(new Error('Language ' + lang + ' is not yet supported.'));
|
||||
}
|
||||
}
|
||||
elements.add(el, lang, hyphenatorSettings);
|
||||
}
|
||||
while (!!(n = el.childNodes[i++])) {
|
||||
if (n.nodeType === 1 && !dontHyphenate[n.nodeName.toLowerCase()] &&
|
||||
n.className.indexOf(dontHyphenateClass) === -1 && !(n in elToProcess)) {
|
||||
@ -952,6 +1064,9 @@ var Hyphenator = (function (window) {
|
||||
}
|
||||
}
|
||||
};
|
||||
if (css3) {
|
||||
css3_gethsupport();
|
||||
}
|
||||
if (isBookmarklet) {
|
||||
elToProcess = contextWindow.document.getElementsByTagName('body')[0];
|
||||
process(elToProcess, false, mainLanguage);
|
||||
@ -962,44 +1077,109 @@ var Hyphenator = (function (window) {
|
||||
process(tmp, true, '');
|
||||
}
|
||||
}
|
||||
if (!Hyphenator.languages.hasOwnProperty(mainLanguage)) {
|
||||
docLanguages[mainLanguage] = true;
|
||||
} else if (!Hyphenator.languages[mainLanguage].prepared) {
|
||||
docLanguages[mainLanguage] = true;
|
||||
}
|
||||
if (elements.length > 0) {
|
||||
Expando.appendDataForElem(elements[elements.length - 1], {isLast : true});
|
||||
if (elements.count === 0) {
|
||||
//nothing to hyphenate or all hyphenated by css3
|
||||
state = 3;
|
||||
onHyphenationDone();
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
/**
|
||||
* @name Hyphenator-convertPatterns
|
||||
* @name Hyphenator-createTrie
|
||||
* @description
|
||||
* Converts the patterns from string '_a6' to object '_a':'_a6'.
|
||||
* The result is stored in the {@link Hyphenator-patterns}-object.
|
||||
* converts patterns of the given language in a trie
|
||||
* @private
|
||||
* @param {string} lang the language whose patterns shall be converted
|
||||
*/
|
||||
convertPatterns = function (lang) {
|
||||
var plen, anfang, ende, pats, pat, key, tmp = {};
|
||||
pats = Hyphenator.languages[lang].patterns;
|
||||
for (plen in pats) {
|
||||
if (pats.hasOwnProperty(plen)) {
|
||||
plen = parseInt(plen, 10);
|
||||
anfang = 0;
|
||||
ende = plen;
|
||||
while (!!(pat = pats[plen].substring(anfang, ende))) {
|
||||
key = pat.replace(/\d/g, '');
|
||||
tmp[key] = pat;
|
||||
anfang = ende;
|
||||
ende += plen;
|
||||
/** @license BSD licenced code
|
||||
* The following code is based on code from hypher.js and adapted for Hyphenator.js
|
||||
* Copyright (c) 2011, Bram Stein
|
||||
*/
|
||||
var size = 0,
|
||||
tree = {
|
||||
tpoints: []
|
||||
},
|
||||
patterns, pattern, i, j, k,
|
||||
patternObject = Hyphenator.languages[lang].patterns,
|
||||
c, chars, points, t, p, codePoint,
|
||||
getPoints = (function () {
|
||||
//IE<9 doesn't act like other browsers
|
||||
if ('in3se'.split(/\D/).length === 1) {
|
||||
return function (pattern) {
|
||||
var chars = pattern.split(''), c, i, r = [],
|
||||
numb3rs = {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9}, lastWasNum = false;
|
||||
i = 0;
|
||||
while (!!(c = chars[i])) {
|
||||
if (numb3rs.hasOwnProperty(c)) {
|
||||
r.push(c);
|
||||
i += 2;
|
||||
lastWasNum = true;
|
||||
} else {
|
||||
r.push('');
|
||||
i += 1;
|
||||
lastWasNum = false;
|
||||
}
|
||||
}
|
||||
if (!lastWasNum) {
|
||||
r.push('');
|
||||
}
|
||||
return r;
|
||||
};
|
||||
} else {
|
||||
return function (pattern) {
|
||||
return pattern.split(/\D/);
|
||||
};
|
||||
}
|
||||
}());
|
||||
|
||||
for (size in patternObject) {
|
||||
if (patternObject.hasOwnProperty(size)) {
|
||||
patterns = patternObject[size].match(new RegExp('.{1,' + (+size) + '}', 'g'));
|
||||
i = 0;
|
||||
while (!!(pattern = patterns[i++])) {
|
||||
chars = pattern.replace(/[\d]/g, '').split('');
|
||||
points = getPoints(pattern);
|
||||
t = tree;
|
||||
|
||||
j = 0;
|
||||
while (!!(c = chars[j++])) {
|
||||
codePoint = c.charCodeAt(0);
|
||||
|
||||
if (!t[codePoint]) {
|
||||
t[codePoint] = {};
|
||||
}
|
||||
t = t[codePoint];
|
||||
}
|
||||
|
||||
t.tpoints = [];
|
||||
for (k = 0; k < points.length; k++) {
|
||||
p = points[k];
|
||||
t.tpoints.push((p == "") ? 0 : p);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Hyphenator.languages[lang].patterns = tmp;
|
||||
Hyphenator.languages[lang].patternsConverted = true;
|
||||
Hyphenator.languages[lang].patterns = tree;
|
||||
/**
|
||||
* end of BSD licenced code from hypher.js
|
||||
*/
|
||||
},
|
||||
|
||||
recreatePattern = function (pattern, nodePoints) {
|
||||
var r = [], c = pattern.split(''), i;
|
||||
for (i = 0; i < nodePoints.length; i++) {
|
||||
if (nodePoints[i] !== 0) {
|
||||
r.push(nodePoints[i]);
|
||||
}
|
||||
if (c[i]) {
|
||||
r.push(c[i]);
|
||||
}
|
||||
}
|
||||
return r.join('');
|
||||
},
|
||||
|
||||
/**
|
||||
* @name Hyphenator-convertExceptionsToObject
|
||||
* @description
|
||||
@ -1112,7 +1292,7 @@ var Hyphenator = (function (window) {
|
||||
lo.exceptions = {};
|
||||
}
|
||||
convertPatterns(lang);
|
||||
wrd = '[\\w' + lo.specialChars + '@' + String.fromCharCode(173) + '-]{' + min + ',}';
|
||||
wrd = '[\\w' + lo.specialChars + '@' + String.fromCharCode(173) + String.fromCharCode(8204) + '-]{' + min + ',}';
|
||||
lo.genRegExp = new RegExp('(' + url + ')|(' + mail + ')|(' + wrd + ')', 'gi');
|
||||
lo.prepared = true;
|
||||
}
|
||||
@ -1136,7 +1316,6 @@ var Hyphenator = (function (window) {
|
||||
* by repeatedly checking Hyphenator.languages. If a patternfile is loaded the patterns are
|
||||
* converted to their object style and the lang-object extended.
|
||||
* Finally the callback is called.
|
||||
* @param {function()} callback to call, when all patterns are loaded
|
||||
* @private
|
||||
*/
|
||||
prepare = function (callback) {
|
||||
@ -1148,7 +1327,7 @@ var Hyphenator = (function (window) {
|
||||
}
|
||||
}
|
||||
state = 2;
|
||||
callback();
|
||||
callback('*');
|
||||
return;
|
||||
}
|
||||
// get all languages that are used and preload the patterns
|
||||
@ -1176,23 +1355,18 @@ var Hyphenator = (function (window) {
|
||||
delete exceptions[lang];
|
||||
}
|
||||
//Replace genRegExp since it may have been changed:
|
||||
tmp1 = '[\\w' + Hyphenator.languages[lang].specialChars + '@' + String.fromCharCode(173) + '-]{' + min + ',}';
|
||||
tmp1 = '[\\w' + Hyphenator.languages[lang].specialChars + '@' + String.fromCharCode(173) + String.fromCharCode(8204) + '-]{' + min + ',}';
|
||||
Hyphenator.languages[lang].genRegExp = new RegExp('(' + url + ')|(' + mail + ')|(' + tmp1 + ')', 'gi');
|
||||
|
||||
delete docLanguages[lang];
|
||||
callback(lang);
|
||||
continue;
|
||||
} else {
|
||||
loadPatterns(lang);
|
||||
}
|
||||
}
|
||||
}
|
||||
// if all patterns are loaded from storage: callback
|
||||
if (countObjProps(docLanguages) === 0) {
|
||||
state = 2;
|
||||
callback();
|
||||
return;
|
||||
}
|
||||
// else async wait until patterns are loaded, then callback
|
||||
// else async wait until patterns are loaded, then hyphenate
|
||||
interval = window.setInterval(function () {
|
||||
var finishedLoading = true, lang;
|
||||
for (lang in docLanguages) {
|
||||
@ -1202,6 +1376,7 @@ var Hyphenator = (function (window) {
|
||||
delete docLanguages[lang];
|
||||
//do conversion while other patterns are loading:
|
||||
prepareLanguagesObj(lang);
|
||||
callback(lang);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1209,7 +1384,6 @@ var Hyphenator = (function (window) {
|
||||
//console.log('callig callback for ' + contextWindow.location.href);
|
||||
window.clearInterval(interval);
|
||||
state = 2;
|
||||
callback();
|
||||
}
|
||||
}, 100);
|
||||
},
|
||||
@ -1218,7 +1392,6 @@ var Hyphenator = (function (window) {
|
||||
* @name Hyphenator-switchToggleBox
|
||||
* @description
|
||||
* Creates or hides the toggleBox: a small button to turn off/on hyphenation on a page.
|
||||
* @param {boolean} s true when hyphenation is on, false when it's off
|
||||
* @see Hyphenator.config
|
||||
* @private
|
||||
*/
|
||||
@ -1255,6 +1428,7 @@ var Hyphenator = (function (window) {
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
/**
|
||||
* @name Hyphenator-hyphenateWord
|
||||
* @description
|
||||
@ -1269,8 +1443,10 @@ var Hyphenator = (function (window) {
|
||||
* @public
|
||||
*/
|
||||
hyphenateWord = function (lang, word) {
|
||||
var lo = Hyphenator.languages[lang],
|
||||
parts, i, l, w, wl, s, hypos, p, maxwins, win, pat = false, patk, c, t, n, numb3rs, inserted, hyphenatedword, val;
|
||||
var lo = Hyphenator.languages[lang], parts, l, subst,
|
||||
w, characters, originalCharacters, wordLength, i, j, k, node, points = [],
|
||||
characterPoints = [], nodePoints, nodePointsLength, m = Math.max, trie,
|
||||
result = [''], pattern;
|
||||
if (word === '') {
|
||||
return '';
|
||||
}
|
||||
@ -1292,62 +1468,66 @@ var Hyphenator = (function (window) {
|
||||
}
|
||||
return parts.join('-');
|
||||
}
|
||||
//finally the core hyphenation algorithm
|
||||
w = '_' + word + '_';
|
||||
wl = w.length;
|
||||
s = w.split('');
|
||||
if (word.indexOf("'") !== -1) {
|
||||
w = w.toLowerCase().replace("'", "’"); //replace APOSTROPHE with RIGHT SINGLE QUOTATION MARK (since the latter is used in the patterns)
|
||||
} else {
|
||||
w = w.toLowerCase();
|
||||
w = word = '_' + word + '_';
|
||||
if (!!lo.charSubstitution) {
|
||||
for (subst in lo.charSubstitution) {
|
||||
if (lo.charSubstitution.hasOwnProperty(subst)) {
|
||||
w = w.replace(new RegExp(subst, 'g'), lo.charSubstitution[subst]);
|
||||
}
|
||||
}
|
||||
}
|
||||
hypos = [];
|
||||
numb3rs = {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9}; //check for member is faster then isFinite()
|
||||
n = wl - lo.shortestPattern;
|
||||
for (p = 0; p <= n; p++) {
|
||||
maxwins = Math.min((wl - p), lo.longestPattern);
|
||||
for (win = lo.shortestPattern; win <= maxwins; win++) {
|
||||
if (lo.patterns.hasOwnProperty(patk = w.substring(p, p + win))) {
|
||||
pat = lo.patterns[patk];
|
||||
if (enableReducedPatternSet && (typeof pat === 'string')) {
|
||||
lo.redPatSet[patk] = pat;
|
||||
if (word.indexOf("'") !== -1) {
|
||||
w = w.replace("'", "’"); //replace APOSTROPHE with RIGHT SINGLE QUOTATION MARK (since the latter is used in the patterns)
|
||||
}
|
||||
/** @license BSD licenced code
|
||||
* The following code is based on code from hypher.js
|
||||
* Copyright (c) 2011, Bram Stein
|
||||
*/
|
||||
characters = w.toLowerCase().split('');
|
||||
originalCharacters = word.split('');
|
||||
wordLength = characters.length;
|
||||
trie = lo.patterns;
|
||||
for (i = 0; i < wordLength; i += 1) {
|
||||
points[i] = 0;
|
||||
characterPoints[i] = characters[i].charCodeAt(0);
|
||||
}
|
||||
for (i = 0; i < wordLength; i += 1) {
|
||||
pattern = '';
|
||||
node = trie;
|
||||
for (j = i; j < wordLength; j += 1) {
|
||||
node = node[characterPoints[j]];
|
||||
if (node) {
|
||||
if (enableReducedPatternSet) {
|
||||
pattern += String.fromCharCode(characterPoints[j]);
|
||||
}
|
||||
if (typeof pat === 'string') {
|
||||
//convert from string 'a5b' to array [1,5] (pos,value)
|
||||
t = 0;
|
||||
val = [];
|
||||
for (i = 0; i < pat.length; i++) {
|
||||
if (!!(c = numb3rs[pat.charAt(i)])) {
|
||||
val.push(i - t, c);
|
||||
t++;
|
||||
nodePoints = node.tpoints;
|
||||
if (nodePoints) {
|
||||
if (enableReducedPatternSet) {
|
||||
if (!lo.redPatSet) {
|
||||
lo.redPatSet = {};
|
||||
}
|
||||
lo.redPatSet[pattern] = recreatePattern(pattern, nodePoints);
|
||||
}
|
||||
for (k = 0, nodePointsLength = nodePoints.length; k < nodePointsLength; k += 1) {
|
||||
points[i + k] = m(points[i + k], nodePoints[k]);
|
||||
}
|
||||
pat = lo.patterns[patk] = val;
|
||||
}
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
for (i = 0; i < pat.length; i++) {
|
||||
c = p - 1 + pat[i];
|
||||
if (!hypos[c] || hypos[c] < pat[i + 1]) {
|
||||
hypos[c] = pat[i + 1];
|
||||
}
|
||||
i++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
inserted = 0;
|
||||
for (i = lo.leftmin; i <= (word.length - lo.rightmin); i++) {
|
||||
if (!!(hypos[i] & 1)) {
|
||||
s.splice(i + inserted + 1, 0, hyphen);
|
||||
inserted++;
|
||||
for (i = 1; i < wordLength - 1; i += 1) {
|
||||
if (i > lo.leftmin && i < (wordLength - lo.rightmin) && points[i] % 2) {
|
||||
result.push(originalCharacters[i]);
|
||||
} else {
|
||||
result[result.length - 1] += originalCharacters[i];
|
||||
}
|
||||
}
|
||||
hyphenatedword = s.slice(1, -1).join('');
|
||||
if (enableCache) {
|
||||
lo.cache[word] = hyphenatedword;
|
||||
}
|
||||
return hyphenatedword;
|
||||
return result.join(hyphen);
|
||||
/**
|
||||
* end of BSD licenced code from hypher.js
|
||||
*/
|
||||
},
|
||||
|
||||
/**
|
||||
@ -1425,13 +1605,18 @@ var Hyphenator = (function (window) {
|
||||
}
|
||||
//create a hidden shadow element
|
||||
shadow = currDoc.createElement('div');
|
||||
shadow.style.overflow = 'hidden';
|
||||
shadow.style.position = 'absolute';
|
||||
shadow.style.top = '-5000px';
|
||||
shadow.style.height = '1px';
|
||||
//Moving the element out of the screen doesn't work for IE9 (https://connect.microsoft.com/IE/feedback/details/663981/)
|
||||
//shadow.style.overflow = 'hidden';
|
||||
//shadow.style.position = 'absolute';
|
||||
//shadow.style.top = '-5000px';
|
||||
//shadow.style.height = '1px';
|
||||
//doing this instead:
|
||||
shadow.style.color = window.getComputedStyle ? targetWindow.getComputedStyle(body).backgroundColor : '#FFFFFF';
|
||||
shadow.style.fontSize = '0px';
|
||||
body.appendChild(shadow);
|
||||
if (!!window.getSelection) {
|
||||
//FF3, Webkit
|
||||
//FF3, Webkit, IE9
|
||||
e.stopPropagation();
|
||||
selection = targetWindow.getSelection();
|
||||
range = selection.getRangeAt(0);
|
||||
shadow.appendChild(range.cloneContents());
|
||||
@ -1439,10 +1624,12 @@ var Hyphenator = (function (window) {
|
||||
selection.selectAllChildren(shadow);
|
||||
restore = function () {
|
||||
shadow.parentNode.removeChild(shadow);
|
||||
selection.removeAllRanges(); //IE9 needs that
|
||||
selection.addRange(range);
|
||||
};
|
||||
} else {
|
||||
// IE
|
||||
// IE<9
|
||||
e.cancelBubble = true;
|
||||
selection = targetWindow.document.selection;
|
||||
range = selection.createRange();
|
||||
shadow.innerHTML = range.htmlText;
|
||||
@ -1464,12 +1651,59 @@ var Hyphenator = (function (window) {
|
||||
}
|
||||
el = el || body;
|
||||
if (window.addEventListener) {
|
||||
el.addEventListener("copy", oncopyHandler, false);
|
||||
el.addEventListener("copy", oncopyHandler, true);
|
||||
} else {
|
||||
el.attachEvent("oncopy", oncopyHandler);
|
||||
}
|
||||
},
|
||||
|
||||
/**
|
||||
* @name Hyphenator-unhideElement
|
||||
* @description
|
||||
* Unhides an element and removes the visibility attr if set by hyphenator
|
||||
* @param Object The Element object from ElementCollection
|
||||
* @private
|
||||
*/
|
||||
unhideElement = function (elo) {
|
||||
var el = elo.element,
|
||||
hyphenatorSettings = elo.data;
|
||||
el.style.visibility = 'visible';
|
||||
elo.data.isHidden = false;
|
||||
if (!hyphenatorSettings.hasOwnStyle) {
|
||||
el.setAttribute('style', ''); // without this, removeAttribute doesn't work in Safari (thanks to molily)
|
||||
el.removeAttribute('style');
|
||||
} else {
|
||||
if (el.style.removeProperty) {
|
||||
el.style.removeProperty('visibility');
|
||||
} else if (el.style.removeAttribute) { // IE
|
||||
el.style.removeAttribute('visibility');
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
/**
|
||||
* @name Hyphenator-checkIfAllDone
|
||||
* @description
|
||||
* Checks if all Elements are hyphenated, unhides them and fires onHyphenationDone()
|
||||
* @private
|
||||
*/
|
||||
checkIfAllDone = function () {
|
||||
var allDone = true;
|
||||
elements.each(function (lang, list) {
|
||||
var i, l = list.length;
|
||||
for (i = 0; i < l; i++) {
|
||||
allDone = allDone && list[i].hyphenated;
|
||||
if (intermediateState === 'hidden' && unhide === 'wait') {
|
||||
unhideElement(list[i]);
|
||||
}
|
||||
}
|
||||
});
|
||||
if (allDone) {
|
||||
state = 3;
|
||||
onHyphenationDone();
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
/**
|
||||
* @name Hyphenator-hyphenateElement
|
||||
@ -1480,9 +1714,10 @@ var Hyphenator = (function (window) {
|
||||
* @param {Object} el The element to hyphenate
|
||||
* @private
|
||||
*/
|
||||
hyphenateElement = function (el) {
|
||||
var hyphenatorSettings = Expando.getDataForElem(el),
|
||||
lang = hyphenatorSettings.language, hyphenate, n, i,
|
||||
hyphenateElement = function (lang, elo) {
|
||||
var hyphenatorSettings = elo.data,
|
||||
el = elo.element,
|
||||
hyphenate, n, i,
|
||||
controlOrphans = function (part) {
|
||||
var h, r;
|
||||
switch (hyphen) {
|
||||
@ -1534,52 +1769,51 @@ var Hyphenator = (function (window) {
|
||||
}
|
||||
}
|
||||
}
|
||||
if (hyphenatorSettings.isHidden && intermediateState === 'hidden') {
|
||||
el.style.visibility = 'visible';
|
||||
if (!hyphenatorSettings.hasOwnStyle) {
|
||||
el.setAttribute('style', ''); // without this, removeAttribute doesn't work in Safari (thanks to molily)
|
||||
el.removeAttribute('style');
|
||||
} else {
|
||||
if (el.style.removeProperty) {
|
||||
el.style.removeProperty('visibility');
|
||||
} else if (el.style.removeAttribute) { // IE
|
||||
el.style.removeAttribute('visibility');
|
||||
}
|
||||
}
|
||||
if (hyphenatorSettings.isHidden && intermediateState === 'hidden' && unhide === 'progressive') {
|
||||
unhideElement(elo);
|
||||
}
|
||||
if (hyphenatorSettings.isLast) {
|
||||
state = 3;
|
||||
documentCount--;
|
||||
if (documentCount > (-1000) && documentCount <= 0) {
|
||||
documentCount = (-2000);
|
||||
onHyphenationDone();
|
||||
}
|
||||
elo.hyphenated = true;
|
||||
elements.hyCount += 1;
|
||||
if (elements.count <= elements.hyCount) {
|
||||
checkIfAllDone();
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
/**
|
||||
* @name Hyphenator-hyphenateDocument
|
||||
* @name Hyphenator-hyphenateLanguageElements
|
||||
* @description
|
||||
* Calls hyphenateElement() for all members of elements. This is done with a setTimout
|
||||
* Calls hyphenateElement() for all elements of the specified language.
|
||||
* If the language is '*' then all elements are hyphenated.
|
||||
* This is done with a setTimout
|
||||
* to prevent a "long running Script"-alert when hyphenating large pages.
|
||||
* Therefore a tricky bind()-function was necessary.
|
||||
* @private
|
||||
*/
|
||||
hyphenateDocument = function () {
|
||||
function bind(fun, arg) {
|
||||
hyphenateLanguageElements = function (lang) {
|
||||
function bind(fun, arg1, arg2) {
|
||||
return function () {
|
||||
return fun(arg);
|
||||
return fun(arg1, arg2);
|
||||
};
|
||||
}
|
||||
var i = 0, el;
|
||||
while (!!(el = elements[i++])) {
|
||||
if (el.ownerDocument.location.href === contextWindow.location.href) {
|
||||
window.setTimeout(bind(hyphenateElement, el), 0);
|
||||
var el, i, l;
|
||||
if (lang === '*') {
|
||||
elements.each(function (lang, langels) {
|
||||
var i, l = langels.length;
|
||||
for (i = 0; i < l; i++) {
|
||||
window.setTimeout(bind(hyphenateElement, lang, langels[i]), 0);
|
||||
}
|
||||
});
|
||||
} else {
|
||||
if (elements.list.hasOwnProperty(lang)) {
|
||||
l = elements.list[lang].length;
|
||||
for (i = 0; i < l; i++) {
|
||||
window.setTimeout(bind(hyphenateElement, lang, elements.list[lang][i]), 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
/**
|
||||
* @name Hyphenator-removeHyphenationFromDocument
|
||||
* @description
|
||||
@ -1587,10 +1821,13 @@ var Hyphenator = (function (window) {
|
||||
* @private
|
||||
*/
|
||||
removeHyphenationFromDocument = function () {
|
||||
var i = 0, el;
|
||||
while (!!(el = elements[i++])) {
|
||||
removeHyphenationFromElement(el);
|
||||
}
|
||||
elements.each(function (lang, elo) {
|
||||
var i, l = elo.length, el;
|
||||
for (i = 0; i < l; i++) {
|
||||
removeHyphenationFromElement(elo[i].element);
|
||||
elo[i].hyphenated = false;
|
||||
}
|
||||
});
|
||||
state = 4;
|
||||
},
|
||||
|
||||
@ -1687,7 +1924,7 @@ var Hyphenator = (function (window) {
|
||||
* minor release: new languages, improvements
|
||||
* @public
|
||||
*/
|
||||
version: 'X.Y.Z',
|
||||
version: '4.0.0',
|
||||
|
||||
/**
|
||||
* @name Hyphenator.doHyphenation
|
||||
@ -1892,6 +2129,16 @@ var Hyphenator = (function (window) {
|
||||
defaultLanguage = obj[key];
|
||||
}
|
||||
break;
|
||||
case 'useCSS3hyphenation':
|
||||
if (assert('useCSS3hyphenation', 'boolean')) {
|
||||
css3 = obj[key];
|
||||
}
|
||||
break;
|
||||
case 'unhide':
|
||||
if (assert('unhide', 'string')) {
|
||||
unhide = obj[key];
|
||||
}
|
||||
break;
|
||||
default:
|
||||
onError(new Error('Hyphenator.config: property ' + key + ' not known.'));
|
||||
}
|
||||
@ -1923,7 +2170,7 @@ var Hyphenator = (function (window) {
|
||||
autoSetMainLanguage(undefined);
|
||||
gatherDocumentInfos();
|
||||
//console.log('preparing for ' + contextWindow.location.href);
|
||||
prepare(hyphenateDocument);
|
||||
prepare(hyphenateLanguageElements);
|
||||
if (displayToggleBox) {
|
||||
toggleBox();
|
||||
}
|
||||
@ -2022,8 +2269,7 @@ var Hyphenator = (function (window) {
|
||||
if (n.nodeType === 3 && n.data.length >= min) { //type 3 = #text -> hyphenate!
|
||||
n.data = n.data.replace(Hyphenator.languages[lang].genRegExp, hyphenate);
|
||||
} else if (n.nodeType === 1) {
|
||||
// Modified by Kovid to use element lang only if it has been loaded
|
||||
if (n.lang !== '' && Hyphenator.languages.hasOwnProperty(n.lang)) {
|
||||
if (n.lang !== '') {
|
||||
Hyphenator.hyphenate(n, n.lang);
|
||||
} else {
|
||||
Hyphenator.hyphenate(n, lang);
|
||||
@ -2115,7 +2361,7 @@ var Hyphenator = (function (window) {
|
||||
storeConfiguration();
|
||||
toggleBox();
|
||||
} else {
|
||||
hyphenateDocument();
|
||||
hyphenateLanguageElements('*');
|
||||
Hyphenator.doHyphenation = true;
|
||||
storeConfiguration();
|
||||
toggleBox();
|
||||
@ -2140,4 +2386,4 @@ if (Hyphenator.isBookmarklet()) {
|
||||
Hyphenator.config({displaytogglebox: true, intermediatestate: 'visible', doframes: true});
|
||||
Hyphenator.config(Hyphenator.getConfigFromURI());
|
||||
Hyphenator.run();
|
||||
}
|
||||
}
|
BIN
resources/viewer/hyphenate/patterns.zip
Normal file
BIN
resources/viewer/hyphenate/patterns.zip
Normal file
Binary file not shown.
File diff suppressed because one or more lines are too long
@ -1,13 +0,0 @@
|
||||
// For questions about the Bengali hyphenation patterns
|
||||
// ask Santhosh Thottingal (santhosh dot thottingal at gmail dot com)
|
||||
Hyphenator.languages['bn'] = {
|
||||
leftmin : 2,
|
||||
rightmin : 2,
|
||||
shortestPattern : 1,
|
||||
longestPattern : 1,
|
||||
specialChars : unescape("আঅইঈউঊঋএঐঔকগখঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহিীাুূৃোোৈৌৗ্ঃং%u200D"),
|
||||
patterns : {
|
||||
2 : "অ1আ1ই1ঈ1উ1ঊ1ঋ1এ1ঐ1ঔ1ি1া1ী1ু1ৃ1ে1ো1ৌ1ৗ1্2ঃ1ং11ক1গ1খ1ঘ1ঙ1চ1ছ1জ1ঝ1ঞ1ট1ঠ1ড1ঢ1ণ1ত1থ1দ1ধ1ন1প1ফ1ব1ভ1ম1য1র1ল1শ1ষ1স1হ",
|
||||
3 : "2ঃ12ং1"
|
||||
}
|
||||
};
|
File diff suppressed because one or more lines are too long
@ -1,16 +0,0 @@
|
||||
Hyphenator.languages['da'] = {
|
||||
leftmin : 2,
|
||||
rightmin : 2,
|
||||
shortestPattern : 2,
|
||||
longestPattern : 8,
|
||||
specialChars : "æøå",
|
||||
patterns : {
|
||||
3 : "a3ca1ea3ha3ja5oa5z1ba4bd1be1bib1j1bo4bsb5t3bub5w1by1ce3chck35cy3dad1b1ded1fd1gd3h1did3jd1kd1ld1m3dod1p1dud1v3dye3ee1he5x1faf1bf1d1fef1ff1gf1h1fif1k3fl1fof1p4ft1fuf1v3fy1gag1bg1d1geg3fg1gg1h1gi5gjg3kg1lg1m3gog3p1grg3v1gyi1ai3bi1ci3hi5ii5ji1uj5kj3rk5bk3hk1kk1tl1bl1fl3hl3jl1ll3r4ls1mam1bm3d1mem3fm1gm3h1mim3km1lm1mm1n3mom1r3my3nan1bn1c4nd1nen1f1nin1mn1n1non5pn3r4ns3nyn3zo3ao1co1eo5ho1jo3t3pap3dp3fp3mp3np1t1pup5vqu4r1br1fr1hr1lr1nr3pr1rs1d1ses1fs1msp44tbt1ht1mt1n4tsu1au1eu3iu5qv5hv5jv5kvl41vov5pv5t3vuy3ay3ey5o5bæ3dæ3døe3æe5å3fæ3fø3gæ3gåi3ø3kø3kå1mæ3mø3må3næ5nøo5åpå31sæ1sø5våæ3cæ3eæ5iæ5oø3eå1då1eå5hå3lå3t",
|
||||
4 : "_ae3_om1_po15adg5afgaf3r5afsa4gia4gya5kaa3kea5kraku5a3laa1lea1lial3ka1loa3lua1lya3nu3anva5pea3pia5poa1ra1arba1re5arga1ria3roa3saa3sca1sia3ska3soa1tea1tia1toa5tra1tua5vaa1vebe1k4b1n1br4bs5kb3sob1stby5s4c1c4ch_ci4oda4sd1d4de5ddi1edi5l4d1n4dopd5ovd5rud4smd4sud3tad1tedt5od5trdt5udub5e5ade3afe5age3ake1ale3ane5ape3ate3blebs3e1cie4do3effe3fr3efte3gue3inei5se3jee1kae3kee3kle5kre3kue1kve5kye3lee1lie3loe5lue3lyem1s4enne4noe5nue5ole3ope1ore3ovepi3e1pre3rae1ree1rier1ker3se5rye1tae1tee1tie3tje1toe3tre3tue1tye3ume3un3eure1vae3vee1vifej4f1s4f3taf1tef1tif5toge3sgi4bg5ovgs1ag4segs1pgs1vg3tag1teg1tig5tog3trgt4sg3udgun5g5yd4ha_he5s4hethi4ehi3s4h3thun4hvo4i3dri1eli1eni3erif3ri3gui1kai1keik1li5koi3kuik3vi3liil3ki1loil5ui3mu5infin3si3nui3odi3ogi5oki3olion4i3oti5pii5pri3rei3riir5ti3sci3sii4smis3pi1tai1tei1tii3toi3tri1tui3tyi1vai1vei1vij3agjds1j3lej3lijre5ju3s5kapk5au5kavki3ek1le3kluk4ny5kod1konko3v1kra5kryk1siks3kks1pks5vkt5s3kur1kus3kutk4vok4vu5lab5lam1latl3dr1le_5led3len1ler1les4leuli5ol1kel1kol3kyl5mul3op3lov4l3pl4psl5sjl1tal1tel3tilt3ol3trl3tulu5ll3vel3vimi3kmi4o4mopm1pem3pim3plm1pom3prm5skms3pms5vm3tam3tem3tim3trm1ud1mul4nak1naln3drne5aneo4n4go4n1h4nimni5on1ken1kon3krn3kun5kv4n1ln3sin1tan1ten1tin3ton1trn3tun3ty4n1vo4asod5sof5ro5ino3kao1keo3kuo3lao3leo1lio1loo3luo5ly1omron3kook5o3oro5ovo3piop3lop3rop3s4or_o3rior3kor5oo3sio3soo1teo5unov4s4pec3pen1perpe5spe3u4p5h1pla5pok3potp4rop3skp5sops4pp3stpu5b5py34rafr3dr1relr1guri1er3kar1ker1kir3kurmo4r5muro1bro3pr3orr1sar1sirs4nr3spr5sur3svr1ter1tir3tort3sr5tyr3ud5rutr3var1ver3viry4ss3af1sams3aps1ar1sat4s1bsdy4s4ed4s3h1sig5sis5sit5sius5ju4sk_1skes3kl5skys1les1lislo35slus5lys4myso5k5sol3sons1pls5r4s1s44st_5stj3sto1strs1ud3suls3un3surs3ve3s4y5ta_1tag3tegteo14t1f6t3g3tid4t3k4t1l4t3pt4ra1tryt3sit3st4t1t5turt5ve1typ5udlud5rud3s3udvugs3u5guu5klu1lau1leu5lyu5peup5lu3rau3reu3rous5au3siu5sous5vu1teu1tiu1tout5r5u5vva5d1vedve3s5vet1visv3lev5livls1v5rev3stv5suy5dry3key5kiy3koy3kvy5liy5loy5muyns5y1pey3piy3rey3riy3siy3tiy5vezi5o_så3a3tøa5væe3læe3løe3røe5t
æe5tøe1vægiø4g4søg5så3gø1i5tæl3væ5løsm5tån3kæn5tæo5læor3ø5præ5pædr5kær5tær5tør3vær5æl4røn5rør3rådr5års4kå3slås4næ5stø1stås5økti4øt4søt5såt3væu3læy5vææb3læg5aægs5æ5kvæ1reæ3riær5sæ5siæ3soæ3veøde5ø1jeø3keø3leøms5ø1reø3riør5oø1veå3reå5sk",
|
||||
5 : "_an3k_an1s_be1t_her3_ove4_til3_yd5rab5le3abstaf4ria4gefag5inag5si3agtiais5t4alkval5siam4paar5af3a3spa3stea3stia1ta1ato5vba4tibe3robe5rube1s4be1trbi5skbo4grbo3rabo5rece5ro4d3afde5sk3drif3drivd5rosds5ands5ind1skidsu5lds5viea4laed5aredde4ed5raed3re4e1koek5sa3ekspe3ladel3akel3are1lase4lek3elem5elimel5sae4maden5ake4nanen3soer3afe4rage4rake4ref5erhve4ribero5der5over5tre3rumer5unfa4cefags3fejl1fo4rif5tvig3artgi3st4g5omgsha4g5slags3org4strheds3hi4n5ho5koho5vehund3i4bleids5ki3et_ik3reik5riiks5tik4tui3lagil3egil5ejil5elind3tings1in4svions1i5o5ri3plii3stii5suakel5ske3skke5stki3stk5lakko3ra3kortks3ank3stek5stuk4tarkti4ekt5relad3r5lagdld3st4lelele4molfin4l1go1li4galo4du4l5orlses1ls5inl4taf4m5ejm5ingmmen5mo4da4m5ovmse5sms5inm3stemu1lind5sind5sknd5spne4dan3erkn5erlne5slne5stni3stn3ordn1skuns3pon1stan5stint4suob3lio4dinod5riod5uno4geko4gelo4g5oog5reog5sk3optaor1an3ordnord5so3re_o3rego3reko3rero3retor5imor3slor3stpa5ghp5anlpe1rap4lan4ple_4pler4ples4p5p41procp5ulera5is4rarbrd4s34reksre5la5rese4ressre3st5rettri5la4rimor4ing4rinp4rintrk3sorre5sr5skrr5stur5talrt3rer5trir5trosa4ma5s4erse4se4s1g4si4bls5int1skabsk5s44snins4nit5som_3somms5oms5somt4s1op3spec4sper3s4pi1stanst5as3stat1stav1ste_1sted3stel1sten5step3stes5stetst5om1sy1s4tanvteds55tekn5termte5roti4enti3stto5rato1reto1ritor4m4trestro5vts4pats5prts5ult5udsue4t5uk4tauk4tru1reru5skaut5s43varm4v5omyk3liyk4s5yr3eky5t3r_ær5i_øv3rbrød35drøvdstå4er5øn4n5æb4s5ænså4r53værd1værkæ4gekæ4g5rælle4æn1drær4maær4moæ3steøn3støn4t3ørne3års5t",
|
||||
6 : "_be5la_bi4tr_der3i_ne4t5ade5la5a4f1l3analyan4k5ra4t5ind5antade4rig4dretteddel5ed3rined4stre4j5el3eksemer5egeetek4sfor1enger3ini4l5idinter1i1sterit5re_jek4to4j5en_ke4t5ak4terhla4g3rlfind54l5ins4l3intmi5stynemen4nta4lent4s5toi6s5eo3re3so4r5in4po3rer4d5arr5enssre5spore5s4urro4n5r4sk5vr4telir4t5orrt5ratrun4da5s4tam5stemo3ster_tands3tede4ltli4s5uge4riu4r3egve4l5eve4reg3træk_ær4g5r",
|
||||
7 : "_hoved3bu4s5tr4de4lemder5eri5d4reveem4p5lee4v3erfjde4rerjlmeld5l4t5erfntiali43orientringse43sprog_vi4l3infø4r5en",
|
||||
8 : "_diagno54g5endengsde4leng5s4tidejlmel4dinement5eringso4rtialis5t",
|
||||
9 : "ldiagnos5lingeniø4"
|
||||
}
|
||||
};
|
File diff suppressed because one or more lines are too long
@ -1,20 +0,0 @@
|
||||
// Hyphenation patterns for Modern Monotonic Greek.
|
||||
// Created by Dimitrios Filippou with some ideas borrowed from
|
||||
// Yannis Haralambous, Kostis Dryllerakis and Claudio Beccari.
|
||||
// From http://tug.org/svn/texhyphen/branches/ptex/hyph-utf8/tex/generic/hyph-utf8/patterns/tex/hyph-el-monoton.tex
|
||||
// Converted by Pablo Rodríguez (hyphenator at pragmata dot tk)
|
||||
Hyphenator.languages['el-monoton'] = Hyphenator.languages['el'] = {
|
||||
leftmin : 2,
|
||||
rightmin : 2,
|
||||
shortestPattern : 1,
|
||||
longestPattern : 7,
|
||||
specialChars : "αεηιουωϊϋάέήίόύώΐΰίάύέήόώβγκδζθλμπντξρσϲςφχψ'ʼ᾿’᾽",
|
||||
patterns : {
|
||||
2 : "α1ε1η1ι1ο1υ1ω1ϊ1ϋ1ά1έ1ή1ί1ό1ύ1ώ1ΐ1ΰ14'4ʼ4᾿",
|
||||
3 : "α2ια2ία2ίά2ιά2ιά2ϊά2ϊα2υα2ύα2ύά3υά3υε2ιε2ίε2ίέ2ιέ2ιέ2ϊέ2ϊε2υε2ύε2ύέ3υέ3υη2υη2ύη2ύή3υή3υο2ιο2ίο2ίό2ιό2ιό2ϊό2ϊο2υο2ύο2ύό3υό3υυ2ιυ2ίυ2ίύ3ιύ3ια2ηα2ϊα2ϋε2ϊε2ϋο2ηο2ϊι2αι2άι2άι2ει2έι2έι2οι2όι2όι2ωι2ώι2ώ_ι3_ί3_ί3η2αη2άη2άη2εη2έη2έη2οη2όη2όη2ωη2ώη2ώ_η3_ή3_ή3υ2αυ2άυ2άυ2ου2όυ2όυ2ωυ2ώυ2ώ_υ3_ύ3_ύ34β_4γ_4δ_4ζ_4θ_4κ_4λ_4μ_4ν_4ξ_4π_4ρ_4σ_4ϲ_4ς_4τ_4φ_4χ_4ψ_4β'4βʼ4β᾿4γ'4γʼ4γ᾿4δ'4δʼ4δ᾿4ζ'4ζʼ4ζ᾿4θ'4θʼ4θ᾿4κ'4κʼ4κ᾿4λ'4λʼ4λ᾿4μ'4μʼ4μ᾿4ν'4νʼ4ν᾿4ξ'4ξʼ4ξ᾿4π'4πʼ4π᾿4ρ'4ρʼ4ρ᾿4σ'4σʼ4σ᾿4ϲ'4ϲʼ4ϲ᾿4τ'4τʼ4τ᾿4φ'4φʼ4φ᾿4χ'4χʼ4χ᾿4ψ'4ψʼ4ψ᾿_β4_γ4_δ4_ζ4_θ4_κ4_λ4_μ4_ν4_ξ4_π4_ρ4_σ4_ϲ4_τ4_φ4_χ4_ψ4",
|
||||
4 : "ά3η_ά3η_ά3ι_ά3ι_ο2ειό3η_ό3η_ό3ι_ό3ι_4γκ_4μπ_4ντ_4τζ_4τσ_4τϲ_4τς_4μπ'4μπʼ4μπ᾿4ντ'4ντ’4ντ᾿4τζ'4τζʼ4τζ᾿4τσ'4τσʼ4τσ᾽4τϲ'4τϲʼ4τϲ᾿4β1β4γ1γ4δ1δ4ζ1ζ4θ1θ4κ1κ4λ1λ4μ1μ4ν1ν4π1π4ρ1ρ4σ1σ4ϲ1ϲ4τ1τ4φ1φ4χ1χ4ψ1ψ4β1ζ4β1θ4β1κ4β1μ4β1ν4β1ξ4β1π4β1σ4β1ϲ4β1τ4β1φ4β1χ4β1ψ4γ1β4γ1ζ4γ1θ4γ1μ4γ1ξ4γ1π4γ1σ4γ1ϲ4γ1τ4γ1φ4γ1χ4γ1ψ4δ1β4δ1γ4δ1ζ4δ1θ4δ1κ4δ1λ4δ1ξ4δ1π4δ1σ4δ1ϲ4δ1τ4δ1φ4δ1χ4δ1ψ4ζ1β4ζ1γ4ζ1δ4ζ1θ4ζ1κ4ζ1λ4ζ1μτζ2μ4ζ1ν4ζ1ξ4ζ1π4ζ1ρ4ζ1σ4ζ1ϲ4ζ1τ4ζ1φ4ζ1χ4ζ1ψ4θ1β4θ1γ4θ1δ4θ1ζ4θ1κ4θ1μσθ2μϲθ2μ4θ1ξ4θ1π4θ1σ4θ1ϲ4θ1τ4θ1φ4θ1χ4θ1ψ4κ1β4κ1γ4κ1δ4κ1ζ4κ1θ4κ1μ4κ1ξ4κ1π4κ1σ4κ1ϲ4κ1φ4κ1χ4κ1ψ4λ1β4λ1γ4λ1δ4λ1ζ4λ1θ4λ1κ4λ1μ4λ1ν4λ1ξ4λ1π4λ1ρ4λ1σ4λ1ϲ4λ1τ4λ1φ4λ1χ4λ1ψ4μ1β4μ1γ4μ1δ4μ1ζ4μ1θ4μ1κ4μ1λ4μ1ξ4μ1ρ4μ1σ4μ1ϲ4μ1τ4μ1φ4μ1χ4μ1ψ4ν1β4ν1γ4ν1δ4ν1ζ4ν1θ4ν1κ4ν1λ4ν1μ4ν1ξ4ν1π4ν1ρ4ν1σ4ν1ϲ4ν1φ4ν1χ4ν1ψ4ξ1β4ξ1γ4ξ1δ4ξ1ζ4ξ1θ4ξ1κ4ξ1λ4ξ1μ4ξ1ν4ξ1π4ξ1ρ4ξ1σ4ξ1ϲ4ξ1τ4ξ1φ4ξ1χ4ξ1ψ4π1β4π1γ4π1δ4π1ζ4π1θ4π1κ4π1μ4π1ξ4π1σ4π1ϲ4π1φ4π1χ4π1ψ4ρ1β4ρ1γ4ρ1δ4ρ1ζ4ρ1θ4ρ1κ4ρ1λ4ρ1μ4ρ1ν4ρ1ξ4ρ1π4ρ1σ4ρ1ϲ4ρ1τ4ρ1φ4ρ1χ4ρ1ψ4σ1δ4ϲ1δ4σ1ζ4ϲ1ζ4σ1ν4ϲ1ν4σ1ξ4ϲ1ξ4σ1ρ4ϲ1ρ4σ1ψ4ϲ1ψ4τ1β4τ1γ4τ1δ4τ1θ4τ1κ4τ1ν4τ1ξ4τ1π4τ1φστ2φϲτ2φ4τ1χ4τ1ψ4φ1β4φ1γ4φ1δ4φ1ζ4φ1κ4φ1μ4φ1ν4φ1ξ4φ1π4φ1σ4φ1ϲ4φ1χ4φ1ψ4χ1β4χ1γ4χ1δ4χ1ζ4χ1κ4χ1μ4χ1ξ4χ1π4χ1σ4χ1ϲ4χ1φ4χ1ψ4ψ1β4ψ1γ4ψ1δ4ψ1ζ4ψ1θ4ψ1κ4ψ1λ4ψ1μ4ψ1ν4ψ1ξ4ψ1π4ψ1ρ4ψ1σ4ψ1ϲ4ψ1τ4ψ1φ4ψ1χ4βρ_4γλ_4κλ_4κτ_6κς_6κϲ_4κσ_4λς_4λϲ_4λσ_4μς_4μϲ_4μσ_4νς_4νϲ_4νσ_4ρς_4ρϲ_4ρσ_4σκ_4ϲκ_4στ_4ϲτ_4τλ_4τρ_4φτ_",
|
||||
5 : "ο3ϊ3όο3ϊ3ό4γ1κτ4μ1πτ4ν1τζ4ν1τσ4ν1τϲ4γκς_4γκϲ_4γκσ_4μπλ_4μπν_4μπρ_4ντς_4ντϲ_4ντσ_",
|
||||
6 : "4ρ5γ2μ4ρ5θ2μ4λ5κ2μ4ρ5κ2μ4ν5κ2φ4γ5ξ2τ4ρ5ξ2τ4ρ5φ2ν4ρ5χ2μ4μ5ψ2τ4γ5κ2φ4γκ1ντ4γκ1τζ4γκ1τσ4γκ1τϲ4μπ1ντ4μπ1τζ4μπ1τσ4μπ1τϲ4ντ1μπ4τσ1γκ4τϲ1γκ4τσ1μπ4τϲ1μπ4τσ1ντ4τϲ1ντ",
|
||||
10 : "4χτ_4γκ1μπ"
|
||||
}
|
||||
};
|
@ -1,26 +0,0 @@
|
||||
// Hyphenation patterns for Modern Polytonic Greek.
|
||||
// Created by Dimitrios Filippou with some ideas borrowed from
|
||||
// Yannis Haralambous, Kostis Dryllerakis and Claudio Beccari.
|
||||
// From http://tug.org/svn/texhyphen/branches/ptex/hyph-utf8/tex/generic/hyph-utf8/patterns/tex/hyph-el-polyton.tex
|
||||
// Converted by Pablo Rodríguez (hyphenator at pragmata dot tk)
|
||||
Hyphenator.languages['el-polyton'] = {
|
||||
leftmin : 2,
|
||||
rightmin : 2,
|
||||
shortestPattern : 1,
|
||||
longestPattern : 11,
|
||||
specialChars : "αεηιουωϊϋἀἁἂἃἄἅἆἇἐἑἒἓἔἕἠἡἢἣἤἥἦἧἰἱἲἳἴἵἶἷὀὁὂὃὄὅὐὑὒὓὔὕὖὗὠὡὢὣὤὥὦὧὰὲὴὶὸὺὼᾀᾁᾂᾃᾄᾅᾆᾇᾐᾑᾒᾓᾔᾕᾖᾗᾠᾡᾢᾣᾤᾥᾦᾧᾲᾳᾴᾶᾷῂῃῄῆῇῒῖῗῢῦῧῲῳῴῶῷάέήίόύώΐΰάέήίόύώΐΰβγκδζθλμπντξρσϲςφχψ'ʼ᾿’᾽ῤῥ",
|
||||
patterns : {
|
||||
2 : "α1ε1η1ι1ο1υ1ω1ϊ1ϋ1ἀ1ἁ1ἂ1ἃ1ἄ1ἅ1ἆ1ἇ1ἐ1ἑ1ἒ1ἓ1ἔ1ἕ1ἠ1ἡ1ἢ1ἣ1ἤ1ἥ1ἦ1ἧ1ἰ1ἱ1ἲ1ἳ1ἴ1ἵ1ἶ1ἷ1ὀ1ὁ1ὂ1ὃ1ὄ1ὅ1ὐ1ὑ1ὒ1ὓ1ὔ1ὕ1ὖ1ὗ1ὠ1ὡ1ὢ1ὣ1ὤ1ὥ1ὦ1ὧ1ὰ1ὲ1ὴ1ὶ1ὸ1ὺ1ὼ1ᾀ1ᾁ1ᾂ1ᾃ1ᾄ1ᾅ1ᾆ1ᾇ1ᾐ1ᾑ1ᾒ1ᾓ1ᾔ1ᾕ1ᾖ1ᾗ1ᾠ1ᾡ1ᾢ1ᾣ1ᾤ1ᾥ1ᾦ1ᾧ1ᾲ1ᾳ1ᾴ1ᾶ1ᾷ1ῂ1ῃ1ῄ1ῆ1ῇ1ῒ1ῖ1ῗ1ῢ1ῦ1ῧ1ῲ1ῳ1ῴ1ῶ1ῷ1ά1έ1ή1ί1ό1ύ1ώ1ΐ1ΰ1ά1έ1ή1ί1ό1ύ1ώ1ΐ1ΰ16'6ʼ6᾿",
|
||||
3 : "α2ια2ία2ία2ὶα2ῖα2ἰα2ἴα2ἲα2ἶα2ἱα2ἵα2ἳα2ἷά2ιά2ιά2ϊά2ϊα2υα2ύα2ύα2ὺα2ῦα2ὐα2ὔα2ὒα2ὖα2ὑα2ὕα2ὓα2ὗά3υά3υε2ιε2ίε2ίε2ὶε2ῖε2ἰε2ἴε2ἲε2ἶε2ἱε2ἵε2ἳε2ἷέ2ιέ2ιέ2ϊέ2ϊε2υε2ύε2ύε2ὺε2ῦε2ὐε2ὔε2ὒε2ὖε2ὑε2ὕε2ὓε2ὗέ3υέ3υη2υη2ύη2ύη2ὺη2ῦη2ὐη2ὔη2ὒη2ὖη2ὑη2ὕη2ὓη2ὗο2ιο2ίο2ίο2ὶο2ῖο2ἰο2ἴο2ἲο2ἶο2ἱο2ἵο2ἳο2ἷό2ιό2ιό2ϊό2ϊο2υο2ύο2ύο2ὺο2ῦο2ὐο2ὔο2ὒο2ὖο2ὑο2ὕο2ὓο2ὗό3υό3υυ2ιυ2ίυ2ίυ2ὶυ2ῖυ2ἰυ2ἴυ2ἲυ2ἶυ2ἱυ2ἵυ2ἳυ2ἷα2ηα2ϊα2ϋά3ϋά3ϋε2ηέ2ηέ2ηε2ϊε2ϋό2ηό2ηο2ϊω2ιὠ2ιι2αι2άι2άι2ὰι2ᾶι2ει2έι2έι2ὲι2οι2όι2όι2ὸι2ωι2ώι2ώι2ὼι2ῶ_ί3_ί3_ῖ3_ἰ3_ἱ3η2αῃ2αη2άη2άη2ὰη2ᾶῃ2άῃ2άῃ2ὰῃ2ᾶη2εῃ2εη2έη2έη2ὲῃ2έῃ2έῃ2ὲη2οῃ2οη2όη2όη2ὸῃ2όῃ2όῃ2ὸη2ωῃ2ωη2ώη2ώη2ὼη2ῶῃ2ώῃ2ώῃ2ὼῃ2ῶ_ή3_ή3_ῆ3_ἠ3_ἡ3υ2αυ2άυ2άυ2ὰυ2ᾶυ2ευ2έυ2έυ2ὲυ2ου2όυ2όυ2ὸυ2ωυ2ώυ2ώυ2ὼυ2ῶ_ύ3_ύ3_ῦ3_ὑ36β_6γ_6δ_6ζ_6θ_6κ_6λ_6μ_6ν_6ξ_6π_6ρ_6σ_6ϲ_6ς_6τ_6φ_6χ_6ψ_6β'6βʼ6β᾿6γ'6γʼ6γ᾿6δ'6δʼ6δ᾿6ζ'6ζʼ6ζ᾿6θ'6θʼ6θ᾿6κ'6κʼ6κ᾿6λ'6λʼ6λ᾿6μ'6μʼ6μ᾿6ν'6νʼ6ν᾿6ξ'6ξʼ6ξ᾿6π'6πʼ6π᾿6ρ'6ρʼ6ρ᾿6σ'6σʼ6σ᾿6ϲ'6ϲʼ6ϲ᾿6τ'6τʼ6τ᾿6φ'6φʼ6φ᾿6χ'6χʼ6χ᾿6ψ'6ψʼ6ψ᾿_β6_γ6_δ6_ζ6_θ6_κ6_λ6_μ6_ν6_ξ6_π6_ρ6_σ6_ϲ6_τ6_φ6_χ6_ψ6",
|
||||
4 : "ά3η_ά3η_ά3ι_ά3ι_ά3ϊ_ά3ϊ_ό2ειό2ειό3η_ό3η_ό3ι_ό3ι_ό3ϊ_ό3ϊ_6γκ_6μπ_6ντ_6τζ_6τσ_6τϲ_6τς_6μπ'6μπʼ6μπ᾿6ντ'6ντ’6ντ᾿6τζ'6τζʼ6τζ᾿6τσ'6τσʼ6τσ᾽6τϲ'6τϲʼ6τϲ᾿4β1β4γ1γ4δ1δ4ζ1ζ4θ1θ4κ1κ4λ1λ4μ1μ4ν1ν4π1π4ρ1ρ4ῤ1ῥ4σ1σ4ϲ1ϲ4τ1τ4φ1φ4χ1χ4ψ1ψ4β1ζ4β1θ4β1κ4β1μ4β1ν4β1ξ4β1π4β1σ4β1ϲ4β1τ4β1φ4β1χ4β1ψ4γ1β4γ1ζ4γ1θ4γ1κ4γ1μ4γ1ξ4γ1π4γ1σ4γ1ϲ4γ1τ4γ1φ4γ1χ4γ1ψ4δ1β4δ1γ4δ1ζ4δ1θ4δ1κ4δ1λ4δ1ξ4δ1π4δ1σ4δ1ϲ4δ1τ4δ1φ4δ1χ4δ1ψ4ζ1β4ζ1γ4ζ1δ4ζ1θ4ζ1κ4ζ1λ4ζ1μτζ2μ4ζ1ν4ζ1ξ4ζ1π4ζ1ρ4ζ1σ4ζ1ϲ4ζ1τ4ζ1φ4ζ1χ4ζ1ψ4θ1β4θ1γ4θ1δ4θ1ζ4θ1κ4θ1μσθ2μϲθ2μ4θ1ξ4θ1π4θ1σ4θ1ϲ4θ1τ4θ1φ4θ1χ4θ1ψ4κ1β4κ1γ4κ1δ4κ1ζ4κ1θ4κ1ξ4κ1π4κ1σ4κ1ϲ4κ1φ4κ1χ4κ1ψ4λ1β4λ1γ4λ1δ4λ1ζ4λ1θ4λ1κ4λ1μ4λ1ν4λ1ξ4λ1π4λ1ρ4λ1σ4λ1ϲ4λ1τ4λ1φ4λ1χ4λ1ψ4μ1β4μ1γ4μ1δ4μ1ζ4μ1θ4μ1κ4μ1λ4μ1ξ4μ1π4μ1ρ4μ1σ4μ1ϲ4μ1τ4μ1φ4μ1χ4μ1ψ4ν1β4ν1γ4ν1δ4ν1ζ4ν1θ4ν1κ4ν1λ4ν1μ4ν1ξ4ν1π4ν1ρ4ν1σ4ν1ϲ4ν1τ4ν1φ4ν1χ4ν1ψ4ξ1β4ξ1γ4ξ1δ4ξ1ζ4ξ1θ4ξ1κ4ξ1λ4ξ1μ4ξ1ν4ξ1π4ξ1ρ4ξ1σ4ξ1ϲ4ξ1τ4ξ1φ4ξ1χ4ξ1ψ4π1β4π1γ4π1δ4π1ζ4π1θ4π1κ4π1μ4π1ξ4π1σ4π1ϲ4π1φ4π1χ4π1ψ4ρ1β4ρ1γ4ρ1δ4ρ1ζ4ρ1θ4ρ1κ4ρ1λ4ρ1μ4ρ1ν4ρ1ξ4ρ1π4ρ1σ4ρ1ϲ4ρ1τ4ρ1φ4ρ1χ4ρ1ψ4σ1δ4ϲ1δ4σ1ζ4ϲ1ζ4σ1ν4ϲ1ν4σ1ξ4ϲ1ξ4σ1ρ4ϲ1ρ4σ1ψ4ϲ1ψ4τ1β4τ1γ4τ1δ4τ1θ4τ1ν4τ1ξ4τ1π4τ1φστ2φϲτ2φ4τ1χ4τ1ψ4φ1β4φ1γ4φ1δ4φ1ζ4φ1κ4φ1ν4φ1ξ4φ1π4φ1σ4φ1ϲ4φ1χ4φ1ψ4χ1β4χ1γ4χ1δ4χ1ζ4χ1κ4χ1μ4χ1ξ4χ1π4χ1σ4χ1ϲ4χ1φ4χ1ψ4ψ1β4ψ1γ4ψ1δ4ψ1ζ4ψ1θ4ψ1κ4ψ1λ4ψ1μ4ψ1ν4ψ1ξ4ψ1π4ψ1ρ4ψ1σ4ψ1ϲ4ψ1τ4ψ1φ4ψ1χβγ2κσγ2κϲγ2κσμ2πϲμ2πμν2τσν2τϲν2τ6βρ_6γλ_6κλ_6κτ_6κς_6κϲ_6κσ_6λς_6λϲ_6λσ_6μς_6μϲ_6μσ_6νς_6νϲ_6νσ_6ρς_6ρϲ_6ρσ_6σκ_6ϲκ_6στ_6ϲτ_6τλ_6τρ_6φτ_6χτ_",
|
||||
5 : "ο3ϊ3όο3ϊ3όο3ϊ3ὸβ5ν2τζ5ν2τλ5ν2τρ5ν2τ",
|
||||
6 : "4ρ5γ2μ4ρ5θ2μ4λ5κ2μ4ρ5κ2μ4γ5κ2φ4ν5κ2φ4γ5ξ2τ4ρ5ξ2τ4ρ5φ2ν4ρ5χ2μ4μ5ψ2τ4λ5γ2κ4ν5γ2κ4ρ5γ2κ4τ5γ2κ4ζ5μ2π4λ5μ2π4ν5μ2π4ρ5μ2πἄ5μ2ακἀ5μ2πρὄ5μ2ποὀ5μ2ποὀ5ν2τάὀ5ν2τάὀ5ν2τὰὀ5ν2τᾶ6μ2πλ_6μ2πν_6μ2πρ_",
|
||||
7 : "ἰ5γ2κου_ξε5γ2κ_ξέ5γ2κ_ξέ5γ2κ_σι5γ2κ_ϲι5γ2κἀ5μ2πάκἀ5μ2πάκἀ5μ2πανἀ5μ2πάρἀ5μ2πάρἀ5μ2πᾶρἀ5μ2παρἀρα5μ2πἰ5μ2πρα_κε5μ2π_λό5μ2π_λό5μ2π5μ2πέη_5μ2πέη_5μ2πεη_5μ2πογι_ξε5μ2π_ξέ5μ2π_ξέ5μ2π_ρε5μ2π_ρέ5μ2π_ρέ5μ2π_ρο5μ2πρό5μ2παρό5μ2παρό5μ2περό5μ2περό5μ2πωρό5μ2πωρο5μ2πῶρο5μ2παρο5μ2περο5μ2πωσό5μ2πασό5μ2παϲό5μ2παϲό5μ2πασό5μ2πεσό5μ2πεϲό5μ2πεϲό5μ2πεσο5μ2πῶϲο5μ2πῶσό5μ2πωσό5μ2πωϲό5μ2πωϲό5μ2πωσο5μ2παϲο5μ2πασο5μ2πεϲο5μ2πεσο5μ2πωϲο5μ2πω_τα5μ2π_χα5μ2π_χό5μ2π_χό5μ2π_ξε5ν2τ_ξέ5ν2τ_ξέ5ν2τ6γ2κ1τζ6γ2κ1τσ6γ2κ1τϲ6μ2π1τζ6μ2π1τσ6μ2π1τϲ6τσ5γ2κ6τϲ5γ2κ6τσ5μ2π6τϲ5μ2π6τσ5ν2τ6τϲ5ν2τ",
|
||||
8 : "ἐμι5γ2κρ_μπα5γ2κ_μπι5γ2κ_σπά5γ2κ_σπά5γ2κ_ϲπά5γ2κ_ϲπά5γ2κ_σπα5γ2κ_ϲπα5γ2κ_φιό5γ2κ_φιό5γ2κ_φιο5γ2κἀ6μ3πάριἀ6μ3πάριἀ6μ3παρι_γά5μ2πι_γά5μ2πι_γα5μ2πι_ζεϊ5μ2π_κό5μ2πρ_κό5μ2πρ_κο5μ2πρ_λι5μ2πρ5μ2πέης_5μ2πέης_5μ2πέηϲ_5μ2πέηϲ_5μ2πεης_5μ2πεηϲ_5μ2πέησ_5μ2πέησ_5μ2πεησ__μπι5μ2π_τρο6μ3π_τρό6μ3π_τρό6μ3π_ρου5μ2π_σέ5μ2πρ_σέ5μ2πρ_ϲέ5μ2πρ_ϲέ5μ2πρ_σνο5μ2π_ϲνο5μ2π_σού5μ2π_σού5μ2π_ϲού5μ2π_ϲού5μ2π_σου5μ2π_ϲου5μ2π_τζά5μ2π_τζά5μ2π_τζα5μ2π_τζι5μ2π_τό5μ2πρ_τό5μ2πρ_το5μ2πρ_φρα5μ2πἀ5ν2τάτζἀ5ν2τάτζ_βί5ν2τε_βί5ν2τε_βι5ν2τε_κα5ν2τρ_μαϊ5ν2τ_μπε5ν2τ_μπι5ν2τ_ντα5ν2τ5ν2τίβαν5ν2τίβαν_ρε5ν2τί_ρε5ν2τί_ρε5ν2τι_ροῦ5ν2τ_ρού5ν2τ_ρού5ν2τ_χα5ν2το_χα5ν2τρ_χά5ν2τρ_χά5ν2τρ6γ2κ5μ2π6γ2κ5ν2τ6μ2π5ν2τ6ν2τ5μ2π",
|
||||
9 : "5γ2κραντ_ἴντρι5γ2κἰντρι5γ2κ_μα5γ2κιό_μα5γ2κιό_ντά5γ2κλ_ντά5γ2κλ_ντα5γ2κλἀλα5μ2πουἀρλού5μ2πἀρλού5μ2πἀρλοῦ5μ2πἀρλου5μ2π_βό5μ2πιρ_βό5μ2πιρ_βο5μ2πιρ_κα5μ2πάδ_κα5μ2πάδ_κα5μ2πίν_κα5μ2πίν_κα5μ2πῖν_κα5μ2πιν_κά5μ2ποτ_κά5μ2ποτ_κα5μ2πότ_κα5μ2πότ_κα5μ2ποτ_καου5μ2π_καρα5μ2π5μ2πα5μ2π5μ2πά5μ2π5μ2πά5μ2π5μ2πέ5μ2π5μ2πέ5μ2π5μ2πε5μ2π_νό5μ2πελ_νό5μ2πελ_νο5μ2πελ_ντό5μ2πρ_ντό5μ2πρ_ντο5μ2πρ_σα2μ5ποτ_ϲα2μ5ποτ_τε5μ2πεσ_τε5μ2πεϲ_τζου5μ2π_τσά5μ2πα_τσά5μ2πα_τϲά5μ2πα_τϲά5μ2πα_τσα5μ2πα_τϲα5μ2παἀτρα5ν2τέἀτρα5ν2τέἀτρα5ν2τὲ_γιβε5ν2τ_γκάι5ν2τ_γκάι5ν2τ_γκάϊ5ν2τ_γκάϊ5ν2τ_γκαϊ5ν2τ_κα5ν2ταΐ_κα5ν2ταΐ_κα5ν2ταϊ_μα5ν2τάμ_μα5ν2τάμ_μα5ν2τὰμ_μα5ν2ταμ_μα5ν2τέμ_μα5ν2τέμ_μα5ν2τεμ_μεϊ5ν2τά_μεϊ5ν2τά_μεϊ5ν2τα_μο5ν2τέλ_μο5ν2τέλ_μο5ν2τελμο5ν2τέρνμο5ν2τέρνμο5ν2τερν_νισα5ν2τ_νιϲα5ν2τ_ρεζε5ν2τ_σε5ν2τέφ_σε5ν2τέφ_ϲε5ν2τέφ_ϲε5ν2τέφ_σε5ν2τεφ_ϲε5ν2τεφ_σε5ν2τοῦ_ϲε5ν2τοῦ_σε5ν2τού_σε5ν2τού_ϲε5ν2τού_ϲε5ν2τού_σε5ν2του_ϲε5ν2του_τσα5ν2τί_τσα5ν2τί_τϲα5ν2τί_τϲα5ν2τί_τσα5ν2τι_τϲα5ν2τι",
|
||||
10 : "_γιου5γ2κο_καρα5γ2κι_χούλι5γ2κ_χούλι5γ2κ_χουλι5γ2κ_γιαρα5μ2π_καλα5μ2πα_καλί5μ2πρ_καλί5μ2πρ_καλι5μ2πρ_κα5μ2παρέ_κα5μ2παρέ_κα5μ2παρὲ_κα5μ2παρε_καρνα5μ2π_κολι5μ2πρ_κου5μ2πού_κου5μ2πού_κου5μ2ποῦ_κου5μ2που5μ2πέηδες_5μ2πέηδες_5μ2πέηδεϲ_5μ2πέηδεϲ_5μ2πέηδεσ_5μ2πέηδεσ_5μ2πέηδων_5μ2πέηδων__μπό5μ2πιρ_μπό5μ2πιρ_μπο5μ2πιρ_μπο5μ2πότ_μπο5μ2πότ_μπο5μ2ποτ_σκα5μ2παβ_ϲκα5μ2παβ_ταβλα5μ2π_τζανα5μ2π_τρα5μ2πάλ_τρα5μ2πάλ_τρα5μ2παλ_φά5μ2πρικ_φά5μ2πρικ_φα5μ2πρικ_μπαλά5ν2τ_μπαλά5ν2τ_μπαλα5ν2τ_μπα5ν2ταν_μπου5ν2τα_μπου5ν2τρ",
|
||||
11 : "_καρα6μ3πόλ_καρα6μ3πόλ_καρα6μ3πολ_κολού5μ2πρ_κολού5μ2πρ_κολοῦ5μ2πρ_κολου5μ2πρ_κο6μ3πρέσσ_κο6μ3πρέσσ_κο6μ3πρέϲϲ_κο6μ3πρέϲϲ_κο6μ3πρεσσ_κο6μ3πρεϲϲ_κοντρα5μ2π_κωλού5μ2πρ_κωλού5μ2πρ_κωλοῦ5μ2πρ_κωλου5μ2πρ_μανιτό5μ2π_μανιτό5μ2π_μπα6μ3πάκι_μπα6μ3πάκι_μπα6μ3πακι_ρεπού5μ2πλ_ρεπού5μ2πλ_ρεπου5μ2πλ_τα6μ3περαμ_τα6μ3ποῦρλ_τα6μ3πούρλ_τα6μ3πούρλ_τρα5μ2ποῦκ_τρα5μ2πούκ_τρα5μ2πούκ_τρα5μ2πουκ_τσι5μ2πούκ_τσι5μ2πούκ_τϲι5μ2πούκ_τϲι5μ2πούκ_τσι5μ2πουκ_τϲι5μ2πουκ_τσι5μ2πούσ_τσι5μ2πούσ_τϲι5μ2πούϲ_τϲι5μ2πούϲ_τσι5μ2πουσ_τϲι5μ2πουϲ_γιαχου5ν2τ",
|
||||
12 : "_σαλτιπά5γ2κ_σαλτιπά5γ2κ_ϲαλτιπά5γ2κ_ϲαλτιπά5γ2κ_κουλού5μ2πρ_κουλού5μ2πρ_κουλοῦ5μ2πρ_κουλου5μ2πρ_μπου5μ2πούν_μπου5μ2πούν_μπου5μ2ποῦν_μπου5μ2πουν_χοντρο5μ2πα_λικβι5ν2ταρ_ντερμπε5ν2τ_ντου5ν2τούκ_ντου5ν2τούκ_ντου5ν2τοῦκ_ντου5ν2τουκ_φαστφου5ν2τ_φαϲτφου5ν2τ",
|
||||
13 : "_μπασκε2τ5μ2π_μπαϲκε2τ5μ2π_μπασι5μ2πουζ_μπαϲι5μ2πουζ"
|
||||
}
|
||||
};
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1,17 +0,0 @@
|
||||
Hyphenator.languages['fi'] = {
|
||||
leftmin : 2,
|
||||
rightmin : 2,
|
||||
shortestPattern : 2,
|
||||
longestPattern : 7,
|
||||
specialChars : "öäå",
|
||||
patterns : {
|
||||
3 : "1ba1be1bi1bo1bu1by1da1de1di1do1du1dy1dä1dö1fa1fe1fi1fo1fu1fy1ga1ge1gi1go1gu1gy1gä1gö1ha1he1hi1ho1hu1hy1hä1hö1ja1je1ji1jo1ju1jy1jä1jö1ka1ke1ki1ko1ku1ky1kä1kö1la1le1li1lo1lu1ly1lä1lö1ma1me1mi1mo1mu1my1mä1mö1na1ne1ni1no1nu1ny1nä1nö1pa1pe1pi1po1pu1py1pä1pö1ra1re1ri1ro1ru1ry1rä1rö1sa1se1si1so1su1sy1sä1sö1ta1te1ti1to1tu1ty1tä1tö1va1ve1vi1vo1vu1vy1vä1vöä2yo1yö2ya1äa1öo1äo1öä2äö2öä2öö2ä_ä2u2sb2lb2rd2rf2lf2rg2lg2rk2lp2lp2rc2lq2v",
|
||||
4 : "y1a2y1o2u1y2y1u2ö3a2ö3o2ä3a2ä3o2ä1u2ö1u2u1ä2u1ö2e1aai1aao1aau1aau1eea1uui1uue1uuo1uuää1iää1eää3yi1ääe1ääy1ääi1ööa1eia1oie1aii1auy1eiai1aai1eai1oai1uau1aau1eeu1aie1aie1oie1yiu1aiu1eiu1ooi1aoi1eoi1ooi1uo1uiou1eou1oue1aui1euo1auo1ue1ö2ö1e2r2asl2as1k2vsc2hts2h",
|
||||
5 : "1st2raa1i2aa1e2aa1o2aa1u2ee1a2ee1i2ee1u2ee1y2ii1a2ii1e2ii1o2uu1a2uu1e2uu1o2uu1i2io1a2io1e2keus11b2lo1b2ri1b2ro1b2ru1d2ra1f2la1f2ra1f2re1g2lo1g2ra1k2ra1k2re1k2ri1k2va1p2ro1q2vich2r",
|
||||
6 : "1sp2lialous1rtaus1perus12s1ase2s1apuulo2s1bib3li",
|
||||
7 : "yli1o2pali1a2v2s1ohje1a2sian1a2siat1a2sioi2s1o2sa2n1o2sa_ydi2n12n1otto2n1oton2n1anto2n1anno2n1aika2n1a2jo2s1a2jo",
|
||||
8 : "2s1a2sia2n1o2pet2s1a2loialkei2s12n1e2dus2s1ajatu2s1y2rit2s1y2hti2n1a2jan2n1o2mai2n1y2lit2s1a2len2n1a2len",
|
||||
9 : "2s1o2pisk2n1o2pist2s1o2pist2s1i2dea_2s1i2dean2s1e2sity_suu2r1a2",
|
||||
11 : "1a2siaka2s1"
|
||||
}
|
||||
};
|
@ -1,26 +0,0 @@
|
||||
// The french hyphenation patterns are retrieved from
|
||||
// http://tug_org/svn/texhyphen/trunk/collaboration/repository/hyphenator/
|
||||
Hyphenator.languages['fr'] = {
|
||||
leftmin : 2,
|
||||
rightmin : 3,
|
||||
shortestPattern : 1,
|
||||
longestPattern : 14,
|
||||
specialChars : "àâçèéêîïôûœ’'",
|
||||
patterns : {
|
||||
2 : "1ç1j1q",
|
||||
3 : "1gè’â41zu1zo1zi1zè1zé1ze1za’y4_y41wu1wo1wi1we1wa1vy1vû1vu1vô1vo1vî1vi1vê1vè1vé1ve1vâ1va’û4_û4’u4_u41ba1bâ1ty1be1bé1bè1bê1tû1tu1tô1bi1bî1to1tî1ti1tê1tè1té1te1tà1tâ1ta1bo1bô1sy1sû1su1sœ1bu1bû1by2’21ca1câ1sô1ce1cé1cè1cê1so1sî1si1sê1sè1sé1se1sâ1sa1ry1rû1ru1rô1ro1rî1ri1rê1rè1ré1re1râ1ra’a41py1pû1pu1pô1po1pî1pi1pê1pè1pé1pe1pâ1pa_ô41ci1cî’ô4’o4_o41nyn1x1nû1nu1nœ1nô1no1nî1ni1nê1nè1né1ne1nâ1co1cô1na1my1mû1mu1mœ1mô1mo1mî1mi1cœ1mê1mè1mé1me1mâ1ma1ly1lû1lu1lô1lo1lî1li1lê1lè1cu1cû1cy1lé1d’1da1dâ1le1là1de1dé1dè1dê1lâ1la1ky1kû1ku1kô1ko1kî1ki1kê1kè1ké1ke1kâ1ka2jk_a4’î4_î4’i4_i41hy1hû1hu1hô1ho1hî1hi1hê1hè1hé1he1hâ1ha1gy1gû1gu1gô1go1gî1gi1gê_â41gé1ge1gâ1ga1fy1di1dî1fû1fu1fô1fo’e41fî1fi1fê1fè1do1dô1fé1fe1fâ1fa’è41du1dû1dy_è4’é4_é4’ê4_ê4_e41zy",
|
||||
4 : "1f2lab2h2ckg2ckp2cksd1s22ckb4ck_1c2k2chw4ze_4ne_2ckt1c2lad2hm1s22cht2chsch2r2chp4pe_1t2r1p2h_ph44ph_ph2l2phnph2r2phs1d2r2pht2chn4fe_2chm1p2l1p2r4me_1w2rch2l2chg1c2r2chb4ch_1f2r4le_4re_4de_f1s21k2r4we_1r2h_kh44kh_1k2h4ke_1c2h_ch44ge_4je_4se_1v2r_sh41s2h4ve_4sh_2shm2shr2shs4ce_il2l1b2r4be_1b2l4he_4te__th41t2h4th_g1s21g2r2thl1g2l2thm2thnth2r1g2n2ths2ckf",
|
||||
5 : "2ck3h4rhe_4kes_4wes_4res_4cke_éd2hi4vre_4jes_4tre_4zes_4ges_4des_i1oxy4gle_d1d2h_cul44gne_4fre_o1d2l_sch44nes_4les_4gre_1s2ch_réu24sch_4the_1g2hy4gue_2schs4cle_1g2ho1g2hi1g2he4ses_4tes_1g2ha4ves_4she_4che_4cre_4ces_t1t2l4hes_l1s2t4bes_4ble__con4xil3lco1ap4que_vil3l4fle_co1arco1exco1enco1auco1axco1ef4pes_co1é2per3h4mes__pe4r4bre_4pre_4phe_1p2né4ple__dé2smil3llil3lhil3l4dre_cil3lgil3l4fes_",
|
||||
6 : "’in1o2rcil4l4phre_4dres_l3lioni1algi2fent_émil4l4phle_rmil4l4ples_4phes_1p2neuextra14pres_y1asthpé2nul2xent__mé2sa2pent_y1algi4chre_1m2nès4bres_1p2tèr1p2tér4chle_’en1o24fles_oxy1a2avil4l_en1o24ques_uvil4lco1a2d4bles__in1a2’in1a21s2por_cons4_bi1u2’as2ta_in1e2’in1e2_in1é2’in1é21s2lov1s2lavco1acq2cent__as2ta_co1o24ches_hémi1é_in2er’in2er2s3homo1ioni_in1i2’in1i22went_4shes__ré1a2_ré1é2_ré1e2_ré2el_in1o2ucil4lco1accu2s3tr_ré2er_ré2èr4cles_2vent__ré1i22sent_2tent_2gent__ré1o24gues__re1s24sche_4thes_’en1a2e2s3ch4gres_1s2cop2lent__en1a22nent__in1u2’in1u24gnes_4cres_wa2g3n4fres_4tres_4gles_1octet_dé1o2_dé1io4thre__bi1au2jent__dé1a22zent_4vres_2dent_4ckes_4rhes__dy2s3sub1s22kent_2rent_2bent_3d2hal",
|
||||
7 : "a2g3nos3d2houdé3rent__dé3s2t_dé3s2pé3dent_2r3heur2r3hydri1s2tat2frent_io1a2ctla2w3re’in2u3l_in2u3l2crent_’in2uit_in2uit1s2caph1s2clér_ré2ussi2s3ché_re2s3t_re2s3s4sches_é3cent__seu2le’in2ond_in2ond’in2i3t_in2i3t’in2i3q_ré2aux_in2i3q2shent__di1alduni1a2x’in2ept2flent__in2eptuni1o2v2brent_co2nurb2chent_2quent_1s2perm1s2phèr_ma2c3kuevil4l1s2phér1s2piel1s2tein1s2tigm4chles_1s2tock1s2tyle1p2sych_pro1é2_ma2r1x_stil3lpusil3libril3lcyril3l_pré1s2thril3l_mé3san_pré1u2_mé2s1i_pré1o2_pré1i2piril3lpupil3lâ2ment__pré1e2_pré1é2_pré2au_pré1a22prent_2vrent_supero2_di1e2npoly1u2è2ment_poly1s2poly1o2poly1i2poly1è2poly1é2poly1e2poly1a2supe4r1capil3l2plent_armil5lsemil4lmil4letvacil4l_di2s3h3ph2tis2dlent_a2s3tro4phres_l2ment_i1è2drei1arthr2drent_4phles_supers2ô2ment_extra2i2phent_su3r2ah_su2r3hextra2chypo1u21alcool_per1u2_per1o2_per1i2_per1é2hypo1s2_per1a2hypo1o2hypo1i2hypo1é2_pen2tahypo1e2hypo1a2y1s2tome2s3cophyperu2hype4r1hypers2hypero21m2némohyperi21m2nési4chres_a1è2drehyperé2hypere2hypera2’oua1ou_oua1ouo1s2tomo1s2timo1s2tato1s2tasomni1s2tung2s3_dé3s2c2blent__bio1a2télé1e2télé1i22clent_télé1s22guent_1é2nerg2grent_2trent__dé2s1œ2t3heuro1è2dre2gnent_2glent_4thres__bi1a2t1é2drie_bi1a2c_i2g3nin3s2at_’i2g3ni2ckent__i2g3né’ab3réa’i2g3né_ab3réa_per1e2",
|
||||
8 : "_ma2l1ap_dy2s1u2_dy2s1o2_dy2s1i2n3s2ats__dy2s1a2distil3l1é2lectrinstil3l1s2trophe2n1i2vro2b3long1s2tomos_ae3s4ch’ae3s4ch_eu2r1a2ombud2s3’eu2r1a2_mono1s2_mono1u2o1s2téro_mono1o2eu1s2tato1s2tradfritil3la2l1algi_mono1i2_mono1é2_ovi1s2c’ovi1s2c_mono1e2_mono1a2co1assocpaléo1é2boutil3l1s2piros_ré2i3fi_pa2n1ischevil4l1s2patiaca3ou3t2_di1a2cé_para1s2_pa2r3héco1assur_su2b1é2tu2ment_su2ment__su2b1in_su2b3lupapil3lire3pent_’inte4r3_su2b1urab3sent__su2b1a2di2s3cophu2ment_fu2ment__intera2au2ment_as2ment_or2ment_’intera2_intere2pé1r2é2q_péri1os_péri1s2ja3cent__anti1a2_péri1u2’anti1a2er2ment__anti1e2ac3cent_ar2ment_to2ment_’intere2ré3gent_papil3leom2ment_’anti1e2photo1s2_anti1é2_interé2’anti1é2_anti1s2’anti1s23ph2talé’interé2ri2ment__interi2’interi2mi2ment_apo2s3tri2s3chio_pluri1ai2s3chia_intero2’intero2_inte4r3po1astre_interu2’interu2_inters2ai2ment_’inters2papil3la_tri1o2n_su2r1a2_pon2tet_pos2t3h_dés2a3mes3cent__pos2t3r_post1s2_tri1a2tta2ment__tri1a2nra2ment_is3cent__su2r1e2_tri1a2cfa2ment_da2ment__su3r2et_su2r1é2_mé2s1es_mé2g1oh_su2r1of_su2r1ox_re3s4ty_re3s4tu_ma2l1oc’a2g3nat_dé2s1é2_ma2l1entachy1a2_pud1d2ltchin3t2_re3s4trtran2s3p_bi2s1a2tran2s3hhémo1p2té3quent__a2g3nat_dé2s1i2télé1o2bo2g3nosiradio1a2télé1o2ppu2g3nacru3lent__sta2g3nre3lent__ré2a3le_di1a2mi",
|
||||
9 : "_ré2a3lit_dé3s2o3lthermo1s2_dé3s2ist_dé3s2i3rmit3tent_éni3tent__do3lent__ré2a3lisopu3lent__pa3tent__re2s3cap_la3tent__co2o3lie_re2s3cou_re2s3cri_ma2g3num_re2s3pir_dé3s2i3dco2g3nititran2s1a2tran2s1o2_dé3s2exu_re3s4tab_re3s4tag_dé3s2ert_re3s4tat_re3s4tén_re3s4tér_re3s4tim_re3s4tip_re3s4toc_re3s4toptran2s1u2_no2n1obs_ma2l1a2v_ma2l1int_prou3d2hpro2s3tativa3lent__ta3lent__rétro1a2_pro1s2cé_ma2l1o2dcci3dent__pa3rent__su2r1int_su2r1inf_su2r1i2mtor3rent_cur3rent__mé2s1u2stri3dent__dé3s2orm_su3r2ell_ar3dent__su3r2eaupru3dent__pré2a3lacla2ment__su3r2a3t_pos2t1o2_pos2t1inqua2ment_ter3gent_ser3gent_rai3ment_abî2ment_éci2ment_’ar3gent__ar3gent_rin3gent_tan3gent_éli2ment_ani2ment_’apo2s3ta_apo2s3tavélo1s2kivol2t1amp_dé3s2orp_dé2s1u2n_péri2s3ssesqui1a2’ana3s4trfir2ment_écu2ment_ser3pent_pré3sent_’ar3pent__ar3pent_’in1s2tab_in1s2tab’in2o3cul_in2o3culplu2ment_bou2ment_’in2exora_in2exora_su2b3linbru2ment__su3b2é3r_milli1am’in2effab_in2effab’in2augur_di1a2cid_in2augur_pa2n1opt’in2a3nit_in2a3nit1informat_ana3s4trvanil3lis_di1a2tom_su3b2altvanil3linstéréo1s2_pa2n1a2fo1s2tratuépi2s3cop_ci2s1alp1s2tructu1é2lément1é2driquepapil3lomllu2ment_",
|
||||
10 : "1s2tandardimmi3nent__émi3nent_imma3nent_réma3nent_épi3s4cope_in2i3miti’in2i3miti_res3sent_moye2n1â2gréti3cent__dé3s2a3crmon2t3réalinno3cent__mono1ï2dé_pa2n1a2méimpu3dent__pa2n1a2ra_amino1a2c’amino1a2c_pa2n1o2phinci3dent__ser3ment_appa3rent_déca3dent__dacryo1a2_dé3s2astr_re4s5trin_dé3s2é3gr_péri2s3ta_sar3ment__dé3s2oufr_re3s4tandchro2ment__com3ment__re2s3quil_re2s3pons_gem2ment__re2s3pect_re2s3ciso_dé3s2i3gn_dé3s2i3ligram2ment__dé3s2invo_re2s3cisitran3s2act’anti2enneindo3lent__sou3vent_indi3gent_dili3gent_flam2ment_impo3tent_inso3lent_esti2ment_’on3guent__on3guent_inti2ment__dé3s2o3défécu3lent_veni2ment_reli2ment_vidi2ment_chlo2r3é2tpu2g3nablechlo2r3a2cryth2ment_o2g3nomonicarê2ment__méta1s2ta_ma2l1aisé_macro1s2célo3quent_tran3s2ats_anti2enne",
|
||||
11 : "_contre1s2cperti3nent_conti3nent__ma2l1a2dro_in2é3lucta_psycho1a2n_dé3s2o3pil’in2é3luctaperma3nent__in2é3narratesta3ment__su2b3liminrésur3gent_’in2é3narraimmis4cent__pro2g3nathchien3dent_sporu4lent_dissi3dent_corpu3lent_archi1é2pissubli2ment_indul3gent_confi3dent__syn2g3nathtrucu3lent_détri3ment_nutri3ment_succu3lent_turbu3lent__pa2r1a2che_pa2r1a2chèfichu3ment_entre3gent_conni3vent_mécon3tent_compé3tent__re4s5trict_dé3s2i3nen_re2s3plend1a2nesthésislalo2ment__dé3s2ensib_re4s5trein_phalan3s2tabsti3nent_",
|
||||
12 : "polyva3lent_équiva4lent_monova3lent_amalga2ment_omnipo3tent__ma2l1a2dreséquipo3tent__dé3s2a3tellproémi3nent_contin3gent_munifi3cent__ma2g3nicideo1s2trictionsurémi3nent_préémi3nent__bai2se3main",
|
||||
13 : "acquies4cent_intelli3gent_tempéra3ment_transpa3rent__ma2g3nificatantifer3ment_",
|
||||
14 : "privatdo3cent_diaphrag2ment_privatdo3zent_ventripo3tent__contre3maître",
|
||||
15 : "grandilo3quent_",
|
||||
16 : "_chè2vre3feuille"
|
||||
}
|
||||
};
|
File diff suppressed because one or more lines are too long
@ -1,12 +0,0 @@
|
||||
// For questions about the Gujarati hyphenation patterns
|
||||
// ask Santhosh Thottingal (santhosh dot thottingal at gmail dot com)
|
||||
Hyphenator.languages['gu'] = {
|
||||
leftmin : 2,
|
||||
rightmin : 2,
|
||||
shortestPattern : 1,
|
||||
longestPattern : 1,
|
||||
specialChars : unescape("આઅઇઈઉઊઋએઐઔકગખઘઙચછજઝઞટઠડઢણતથદધનપફબભમયરલવશષસહળિીાુૂૃેાોૈૌ્ઃં%u200D"),
|
||||
patterns : {
|
||||
2 : "અ1આ1ઇ1ઈ1ઉ1ઊ1ઋ1એ1ઐ1ઔ1િ1ા1ી1ુ1ૂ1ૃ1ે1ો1ૌ1્2ઃ1ં11ક1ગ1ખ1ઘ1ઙ1ચ1છ1જ1ઝ1ઞ1ટ1ઠ1ડ1ઢ1ણ1ત1થ1દ1ધ1ન1પ1ફ1બ1ભ1મ1ય1ર1લ1વ1શ1ષ1સ1હ1ળ"
|
||||
}
|
||||
};
|
@ -1,12 +0,0 @@
|
||||
// For questions about the Hindi hyphenation patterns
|
||||
// ask Santhosh Thottingal (santhosh dot thottingal at gmail dot com)
|
||||
Hyphenator.languages['hi'] = {
|
||||
leftmin : 2,
|
||||
rightmin : 2,
|
||||
shortestPattern : 1,
|
||||
longestPattern : 1,
|
||||
specialChars : unescape("आअइईउऊऋऎएऐऒऔकगखघङचछजझञटठडढणतथदधनपफबभमयरलवशषसहळऴऱिीाुूृॆेॊाोैौ्ःं%u200D"),
|
||||
patterns : {
|
||||
2 : "अ1आ1इ1ई1उ1ऊ1ऋ1ऎ1ए1ऐ1ऒ1औ1ि1ा1ी1ु1ू1ृ1ॆ1े1ॊ1ो1ौ1्2ः1ं11क1ग1ख1घ1ङ1च1छ1ज1झ1ञ1ट1ठ1ड1ढ1ण1त1थ1द1ध1न1प1फ1ब1भ1म1य1र1ल1व1श1ष1स1ह1ळ1ऴ1ऱ"
|
||||
}
|
||||
};
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1,20 +0,0 @@
|
||||
Hyphenator.languages['it'] = {
|
||||
leftmin : 2,
|
||||
rightmin : 2,
|
||||
shortestPattern : 1,
|
||||
longestPattern : 8,
|
||||
specialChars : "àéèìòù’'",
|
||||
// The italian hyphenation patterns are retrieved from
|
||||
// http://www.ctan.org/tex-archive/language/hyphenation/ithyph.tex
|
||||
patterns : {
|
||||
2 : "1b1c1d1f1g1h1j1k1l1m1n1p1q1r1t1v1w1x1z",
|
||||
3 : "2’2e2w2bb2bc2bd2bf2bm2bn2bp2bs2bt2bvb2lb2r2b_2b’2cb2cc2cd2cf2ck2cm2cn2cq2cs2ct2czc2hc2lc2r2c_2c’_c22db2dd2dg2dl2dm2dn2dpd2r2ds2dt2dv2dw2d_2d’_d22fb2fg2ff2fnf2lf2r2fs2ft2f_2f’2gb2gd2gf2ggg2hg2l2gmg2n2gpg2r2gs2gt2gv2gw2gz2g_2g’2hb2hd2hhh2l2hm2hn2hr2hv2h_2h’2j_2j’2kg2kfk2h2kkk2l2kmk2r2ks2kt2k_2k’2lb2lc2ld2lgl2h2lk2ll2lm2ln2lp2lq2lr2ls2lt2lv2lw2lz2l_2mb2mc2mf2ml2mm2mn2mp2mq2mr2ms2mt2mv2mw2m_2m’2nb2nc2nd2nf2ng2nk2nl2nm2nn2np2nq2nr2ns2nt2nv2nz2n_2n’2pdp2hp2l2pn2ppp2r2ps2pt2pz2p_2p’2qq2q_2q’2rb2rc2rd2rfr2h2rg2rk2rl2rm2rn2rp2rq2rr2rs2rt2rv2rx2rw2rz2r_2r’1s22sz4s_2tb2tc2td2tf2tgt2ht2l2tm2tn2tpt2rt2s2tt2tv2twt2z2t_2vcv2lv2r2vv2v_w2h2w_2w’2xb2xc2xf2xh2xm2xp2xt2xw2x_2x’y1i2zb2zd2zl2zn2zp2zt2zs2zv2zz2z_",
|
||||
4 : "_p2sa1iaa1iea1ioa1iua1uoa1ya2at_e1iuo1iao1ieo1ioo1iu2chh2chbch2r2chn2l’_2l’’2shm2sh_2sh’2s3s2stb2stc2std2stf2stg2stm2stn2stp2sts2stt2stv4s’_4s’’2tzktz2s2t’_2t’’2v’_2v’’wa2r2w1yy1ou2z’_2z’’_z2",
|
||||
5 : "_bio1_pre12gh2t2l3f2n2g3n3p2nes4s3mt2t3s",
|
||||
6 : "_a3p2n_anti1_free3_opto1_para1hi3p2n2nheit3p2sicr2t2s32s3p2n3t2sch",
|
||||
7 : "_ca4p3s_e2x1eu_narco1_su2b3r_wa2g3n_wel2t1n2s3fer",
|
||||
8 : "_contro1_fran2k3_li3p2sa_orto3p2_poli3p2_sha2re3_su2b3lu",
|
||||
9 : "_anti3m2n_circu2m1_re1i2scr_tran2s3c_tran2s3d_tran2s3l_tran2s3n_tran2s3p_tran2s3r_tran2s3t",
|
||||
10 : "_di2s3cine"
|
||||
}
|
||||
};
|
@ -1,13 +0,0 @@
|
||||
// For questions about the Kannada hyphenation patterns
|
||||
// ask Santhosh Thottingal (santhosh dot thottingal at gmail dot com)
|
||||
Hyphenator.languages['kn'] = {
|
||||
leftmin : 2,
|
||||
rightmin : 2,
|
||||
shortestPattern : 1,
|
||||
longestPattern : 1,
|
||||
specialChars : "ಆಅಇಈಉಊಋಎಏಐಒಔಕಗಖಘಙಚಛಜಝಞಟಠಡಢಣತಥದಧನಪಫಬಭಮಯರಲವಶಷಸಹಳಱಿೀಾುೂೃೆೇೊಾೋೈೌ್ಃಂ",
|
||||
patterns : {
|
||||
2 : "ಅ1ಆ1ಇ1ಈ1ಉ1ಊ1ಋ1ಎ1ಏ1ಐ1ಒ1ಔ1ೀ1ು1ೂ1ೃ1ೆ1ೇ1ೊ1ೋ1ೌ1್2ಃ1ಂ11ಕ1ಗ1ಖ1ಘ1ಙ1ಚ1ಛ1ಜ1ಝ1ಞ1ಟ1ಠ1ಡ1ಢ1ಣ1ತ1ಥ1ದ1ಧ1ನ1ಪ1ಫ1ಬ1ಭ1ಮ1ಯ1ರ1ಲ1ವ1ಶ1ಷ1ಸ1ಹ1ಳ1ಱ",
|
||||
3 : "2ಃ12ಂ1"
|
||||
}
|
||||
};
|
@ -1,22 +0,0 @@
|
||||
// Latin hyphenation patterns converted by
|
||||
// Pablo Rodríguez (hyphenator at pragmata dot tk)
|
||||
// based on LaTeX Latin hyphenation patterns by Claudio Beccari
|
||||
// (http://tug.ctan.org/tex-archive/language/hyphenation/lahyph.tex)
|
||||
Hyphenator.languages['la'] = {
|
||||
leftmin : 2,
|
||||
rightmin : 2,
|
||||
shortestPattern : 1,
|
||||
longestPattern : 8,
|
||||
specialChars : "æœ",
|
||||
patterns : {
|
||||
2 : "æ1œ11b1c1d1f1g1h1j1k1l1m1n1p1r1t1v1x1z",
|
||||
3 : "2bb2bdb2l2bm2bnb2r2bt2bs2b_2ccc2l2cm2cn2cqc2r2cs2ct2cz2c_2dd2dg2dmd2r2ds2dv2d_2fff2l2fnf2r2ft2f_2gg2gd2gfg2l2gmg2ng2r2gs2gv2g_2hp2ht2h_2kk2lb2lc2ld2lf2lg2lk2ll2lm2ln2lp2lq2lr2ls2lt2lv2l_2mm2mb2mp2ml2mn2mq2mr2mv2m_2nb2nc2nd2nf2ng2nl2nm2nn2np2nq2nr2ns2nt2nv2nx2n_p2hp2l2pn2ppp2r2ps2pt2pz2p_2rb2rc2rd2rf2rgr2h2rl2rm2rn2rp2rq2rr2rs2rt2rv2rz2r_1s22s_2tb2tc2td2tf2tgt2ht2lt2r2tm2tn2tp2tq2tt2tv2t_v2lv2r2vv2xt2xx2x_2z_",
|
||||
4 : "a1iaa1iea1ioa1iuae1aae1oae1ue1iuio1io1iao1ieo1ioo1iuuo3uc2h2k2h22php2pht1qu22s3s2stb2stc2std2stf2stg2stm2stn2stp2stq2sts2stt2stv2st_a1uaa1uea1uia1uoa1uue1uae1uee1uie1uoe1uui1uai1uei1uii1uoi1uuo1uao1ueo1uio1uoo1uuu1uau1ueu1uiu1uou1uu",
|
||||
5 : "_e2x1_o2b3l3f2tn2s3mn2s3f2s3ph2st3l",
|
||||
6 : "_a2b3l_anti13p2sic3p2neua2l1uaa2l1uea2l1uia2l1uoa2l1uue2l1uae2l1uee2l1uie2l1uoe2l1uui2l1uai2l1uei2l1uii2l1uoi2l1uuo2l1uao2l1ueo2l1uio2l1uoo2l1uuu2l1uau2l1ueu2l1uiu2l1uou2l1uua2m1uaa2m1uea2m1uia2m1uoa2m1uue2m1uae2m1uee2m1uie2m1uoe2m1uui2m1uai2m1uei2m1uii2m1uoi2m1uuo2m1uao2m1ueo2m1uio2m1uoo2m1uuu2m1uau2m1ueu2m1uiu2m1uou2m1uua2n1uaa2n1uea2n1uia2n1uoa2n1uue2n1uae2n1uee2n1uie2n1uoe2n1uui2n1uai2n1uei2n1uii2n1uoi2n1uuo2n1uao2n1ueo2n1uio2n1uoo2n1uuu2n1uau2n1ueu2n1uiu2n1uou2n1uua2r1uaa2r1uea2r1uia2r1uoa2r1uue2r1uae2r1uee2r1uie2r1uoe2r1uui2r1uai2r1uei2r1uii2r1uoi2r1uuo2r1uao2r1ueo2r1uio2r1uoo2r1uuu2r1uau2r1ueu2r1uiu2r1uou2r1uu",
|
||||
7 : "_para1i_para1u_su2b3r2s3que_2s3dem_",
|
||||
8 : "_su2b3lu",
|
||||
9 : "_anti3m2n_circu2m1_co2n1iun",
|
||||
10 : "_di2s3cine"
|
||||
}
|
||||
};
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user